{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 找工作数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import sys\n",
    "import os\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "sys.path.append(\"../\")\n",
    "import Utils.utils as utils\n",
    "from Utils.utils import HelloWorld"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>enterpriseId</th>\n",
       "      <th>logo</th>\n",
       "      <th>shortName</th>\n",
       "      <th>industry</th>\n",
       "      <th>econKind</th>\n",
       "      <th>startDate</th>\n",
       "      <th>registCapi</th>\n",
       "      <th>personScope</th>\n",
       "      <th>website</th>\n",
       "      <th>...</th>\n",
       "      <th>photo</th>\n",
       "      <th>label</th>\n",
       "      <th>postCode</th>\n",
       "      <th>recruitJobNum</th>\n",
       "      <th>totalPublicJobNum</th>\n",
       "      <th>provinceCode</th>\n",
       "      <th>cityCode</th>\n",
       "      <th>regionCode</th>\n",
       "      <th>detailedAddress</th>\n",
       "      <th>remarks</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1561969174736142336</td>\n",
       "      <td>1561969173914058752</td>\n",
       "      <td>/91440300MA5GKTA37A/1661238138156/166123813815...</td>\n",
       "      <td>漪畔网络</td>\n",
       "      <td>[\"互联网\",\"大数据\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100-200万</td>\n",
       "      <td>少于50人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>广东省广州市</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1507193335067049984</td>\n",
       "      <td>1507192012301008896</td>\n",
       "      <td>/admin/fd700d14-9b0c-18bc-ae22-3ade75207d54/fd...</td>\n",
       "      <td>海柔创新科技</td>\n",
       "      <td>[\"电子商务\",\"互联网\"]</td>\n",
       "      <td>合资</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>440000.0</td>\n",
       "      <td>440300.0</td>\n",
       "      <td>440306.0</td>\n",
       "      <td>西乡街道南昌社区安络科技产业园B栋201、301、401</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1648526959162036224</td>\n",
       "      <td>1648526958960709632</td>\n",
       "      <td>/admin/1e4afb8d-bea7-aeed-a65e-687d6da3a532/1e...</td>\n",
       "      <td>众云网</td>\n",
       "      <td>[\"互联网\",\"计算机软件\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>http://www.ouryun.com.cn/</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>440000.0</td>\n",
       "      <td>440300.0</td>\n",
       "      <td>440305.0</td>\n",
       "      <td>深圳市南山区粤海街道滨海社区海天一路19、17、18号软件产业基地4栋511</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1562373713612111872</td>\n",
       "      <td>1562373712110551040</td>\n",
       "      <td>/914403006188644414/1661334068655/166133406865...</td>\n",
       "      <td>中银信息</td>\n",
       "      <td>[\"互联网\",\"大数据\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>广东省广州市</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1648164336000176128</td>\n",
       "      <td>1648164335723352064</td>\n",
       "      <td>/admin/8fa85631-bfc8-7a0c-ad73-1cf6355aac59/8f...</td>\n",
       "      <td>森羽网络</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>50-100人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>370000.0</td>\n",
       "      <td>370800.0</td>\n",
       "      <td>370811.0</td>\n",
       "      <td>山东省济宁市高新区洸河街道百丰大厦803</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>558</th>\n",
       "      <td>1505828901006475264</td>\n",
       "      <td>1505827247335014400</td>\n",
       "      <td>/佛山市小羽科技有限公司/4154-16478520jpg/16478520.jpg</td>\n",
       "      <td>佛山小羽智能科技</td>\n",
       "      <td>[\"电子商务\",\"互联网\"]</td>\n",
       "      <td>合资</td>\n",
       "      <td>NaN</td>\n",
       "      <td>200-500万</td>\n",
       "      <td>50-100人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>440000.0</td>\n",
       "      <td>440600.0</td>\n",
       "      <td>440605.0</td>\n",
       "      <td>狮山镇软件园桃园路南海产业智库城一期B座B611室之二(住所申报)</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>559</th>\n",
       "      <td>1476026480080322653</td>\n",
       "      <td>1476026460258041857</td>\n",
       "      <td>NaN</td>\n",
       "      <td>朗新科技</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>合资</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>50-100人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>560</th>\n",
       "      <td>1491292954646675456</td>\n",
       "      <td>1491291815121387520</td>\n",
       "      <td>/泰盈科技/10142-logojpg/logo.jpg</td>\n",
       "      <td>泰盈科技</td>\n",
       "      <td>[\"人力资源服务\",\"企业服务\"]</td>\n",
       "      <td>合资</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>440000.0</td>\n",
       "      <td>440600.0</td>\n",
       "      <td>440605.0</td>\n",
       "      <td>桂城街道融和路25号荣耀国际金融中心4层401-408室(住所申报)</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>561</th>\n",
       "      <td>1518408400017424384</td>\n",
       "      <td>1518408399195340800</td>\n",
       "      <td>/91310000734084709Q/1650852186157/165085218615...</td>\n",
       "      <td>鼎捷软件</td>\n",
       "      <td>[\"互联网\",\"金融\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>500-1000人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>广东省广州市</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>562</th>\n",
       "      <td>1493064051390021632</td>\n",
       "      <td>1493063267566878720</td>\n",
       "      <td>/京东信息/43591-164480911png/164480911.png</td>\n",
       "      <td>京东信息</td>\n",
       "      <td>[\"电子商务\",\"互联网\"]</td>\n",
       "      <td>上市公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>10000人以上</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>320000.0</td>\n",
       "      <td>321300.0</td>\n",
       "      <td>321311.0</td>\n",
       "      <td>洪泽湖东路与清水江路交叉口</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>563 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                      id         enterpriseId  \\\n",
       "0    1561969174736142336  1561969173914058752   \n",
       "1    1507193335067049984  1507192012301008896   \n",
       "2    1648526959162036224  1648526958960709632   \n",
       "3    1562373713612111872  1562373712110551040   \n",
       "4    1648164336000176128  1648164335723352064   \n",
       "..                   ...                  ...   \n",
       "558  1505828901006475264  1505827247335014400   \n",
       "559  1476026480080322653  1476026460258041857   \n",
       "560  1491292954646675456  1491291815121387520   \n",
       "561  1518408400017424384  1518408399195340800   \n",
       "562  1493064051390021632  1493063267566878720   \n",
       "\n",
       "                                                  logo shortName  \\\n",
       "0    /91440300MA5GKTA37A/1661238138156/166123813815...      漪畔网络   \n",
       "1    /admin/fd700d14-9b0c-18bc-ae22-3ade75207d54/fd...    海柔创新科技   \n",
       "2    /admin/1e4afb8d-bea7-aeed-a65e-687d6da3a532/1e...       众云网   \n",
       "3    /914403006188644414/1661334068655/166133406865...      中银信息   \n",
       "4    /admin/8fa85631-bfc8-7a0c-ad73-1cf6355aac59/8f...      森羽网络   \n",
       "..                                                 ...       ...   \n",
       "558         /佛山市小羽科技有限公司/4154-16478520jpg/16478520.jpg  佛山小羽智能科技   \n",
       "559                                                NaN      朗新科技   \n",
       "560                       /泰盈科技/10142-logojpg/logo.jpg      泰盈科技   \n",
       "561  /91310000734084709Q/1650852186157/165085218615...      鼎捷软件   \n",
       "562             /京东信息/43591-164480911png/164480911.png      京东信息   \n",
       "\n",
       "              industry econKind  startDate registCapi personScope  \\\n",
       "0        [\"互联网\",\"大数据\"]     民营公司        NaN   100-200万       少于50人   \n",
       "1       [\"电子商务\",\"互联网\"]       合资        NaN    1000万以上    150-500人   \n",
       "2      [\"互联网\",\"计算机软件\"]     民营公司        NaN        NaN    150-500人   \n",
       "3        [\"互联网\",\"大数据\"]     民营公司        NaN    1000万以上    150-500人   \n",
       "4              [\"互联网\"]     民营公司        NaN        NaN     50-100人   \n",
       "..                 ...      ...        ...        ...         ...   \n",
       "558     [\"电子商务\",\"互联网\"]       合资        NaN   200-500万     50-100人   \n",
       "559            [\"互联网\"]       合资        NaN        NaN     50-100人   \n",
       "560  [\"人力资源服务\",\"企业服务\"]       合资        NaN    1000万以上    150-500人   \n",
       "561       [\"互联网\",\"金融\"]     民营公司        NaN    1000万以上   500-1000人   \n",
       "562     [\"电子商务\",\"互联网\"]     上市公司        NaN    1000万以上    10000人以上   \n",
       "\n",
       "                       website  ... photo label  postCode recruitJobNum  \\\n",
       "0                          NaN  ...   NaN   NaN       NaN             3   \n",
       "1                          NaN  ...   NaN   NaN       NaN             3   \n",
       "2    http://www.ouryun.com.cn/  ...   NaN    []       NaN             3   \n",
       "3                          NaN  ...   NaN   NaN       NaN             2   \n",
       "4                          NaN  ...   NaN    []       NaN             4   \n",
       "..                         ...  ...   ...   ...       ...           ...   \n",
       "558                        NaN  ...   NaN   NaN       NaN             5   \n",
       "559                        NaN  ...   NaN   NaN       NaN             2   \n",
       "560                        NaN  ...   NaN   NaN       NaN             2   \n",
       "561                        NaN  ...   NaN   NaN       NaN             1   \n",
       "562                        NaN  ...   NaN   NaN       NaN             1   \n",
       "\n",
       "    totalPublicJobNum provinceCode  cityCode  regionCode  \\\n",
       "0                   3          NaN       NaN         NaN   \n",
       "1                   3     440000.0  440300.0    440306.0   \n",
       "2                   3     440000.0  440300.0    440305.0   \n",
       "3                   2          NaN       NaN         NaN   \n",
       "4                   4     370000.0  370800.0    370811.0   \n",
       "..                ...          ...       ...         ...   \n",
       "558                 5     440000.0  440600.0    440605.0   \n",
       "559                 2          NaN       NaN         NaN   \n",
       "560                 2     440000.0  440600.0    440605.0   \n",
       "561                 1          NaN       NaN         NaN   \n",
       "562                 1     320000.0  321300.0    321311.0   \n",
       "\n",
       "                            detailedAddress  remarks  \n",
       "0                                    广东省广州市      NaN  \n",
       "1              西乡街道南昌社区安络科技产业园B栋201、301、401        0  \n",
       "2    深圳市南山区粤海街道滨海社区海天一路19、17、18号软件产业基地4栋511      NaN  \n",
       "3                                    广东省广州市      NaN  \n",
       "4                      山东省济宁市高新区洸河街道百丰大厦803      NaN  \n",
       "..                                      ...      ...  \n",
       "558       狮山镇软件园桃园路南海产业智库城一期B座B611室之二(住所申报)        0  \n",
       "559                                     NaN      NaN  \n",
       "560      桂城街道融和路25号荣耀国际金融中心4层401-408室(住所申报)        0  \n",
       "561                                  广东省广州市      NaN  \n",
       "562                           洪泽湖东路与清水江路交叉口        0  \n",
       "\n",
       "[563 rows x 24 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Directory that holds the raw CSV exports; defined once so it can be changed in one place.\n",
    "ORIGIN_DATA_DIR = \"../Data/OriginData\"\n",
    "\n",
    "# Load the three source tables used throughout this notebook.\n",
    "Job = pd.read_csv(f\"{ORIGIN_DATA_DIR}/FindJob.csv\")\n",
    "JobDetail = pd.read_csv(f\"{ORIGIN_DATA_DIR}/FindJobDetail.csv\")\n",
    "CompanyDetail = pd.read_csv(f\"{ORIGIN_DATA_DIR}/CompanyDetail.csv\")\n",
    "\n",
    "# Bare last expression: rendered as the cell output for a first look at the table.\n",
    "CompanyDetail"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据总体查看"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1581, 1575)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Row counts of the two job tables, shown side by side as a tuple.\n",
    "Job.shape[0], JobDetail.shape[0]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "发现两张表的数据条目数不一致（Job 有 1581 条，JobDetail 只有 1575 条），下面查看 JobDetail 中缺失的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# jobIds present in Job that have no matching row in JobDetail.\n",
    "missing_data = set(Job[\"jobId\"].values) - set(JobDetail[\"jobId\"].values)\n",
    "\n",
    "# One-off manual check, left disabled: request each missing jobId from the site's\n",
    "# public job endpoint and print the JSON reply. The proxy entry points at a local\n",
    "# proxy on port 7890 and would need adjusting before re-running.\n",
    "# import requests\n",
    "# for i in missing_data:\n",
    "#     r = requests.get(f\"https://www.5iai.com/api/enterprise/job/public?id={i}\",proxies={\"https\":\"http://localhost:7890\"}).json()\n",
    "#     print(r)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "用这些缺失的 jobId 请求网站的职位详情接口后，发现并没有数据返回，只返回了状态码；\n",
    "\n",
    "接下来查看Job中的数据是怎么样的"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>jobId</th>\n",
       "      <th>publishTime</th>\n",
       "      <th>updateTime</th>\n",
       "      <th>willNature</th>\n",
       "      <th>positionName</th>\n",
       "      <th>minimumWage</th>\n",
       "      <th>maximumWage</th>\n",
       "      <th>payMethod</th>\n",
       "      <th>exp</th>\n",
       "      <th>educationalRequirements</th>\n",
       "      <th>count</th>\n",
       "      <th>enterpriseId</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>1551740730014302209</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-07-26 09:26</td>\n",
       "      <td>2</td>\n",
       "      <td>NLP算法研究员</td>\n",
       "      <td>26000</td>\n",
       "      <td>44000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1549225204755070976</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>86</th>\n",
       "      <td>1551739682566569984</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-07-26 09:22</td>\n",
       "      <td>2</td>\n",
       "      <td>数据处理工程师</td>\n",
       "      <td>29000</td>\n",
       "      <td>35000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1549225204755070976</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87</th>\n",
       "      <td>1551739421710221313</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-07-26 09:21</td>\n",
       "      <td>2</td>\n",
       "      <td>NLP算法研究员</td>\n",
       "      <td>26000</td>\n",
       "      <td>44000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1549225204755070976</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>88</th>\n",
       "      <td>1551739421710221312</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-07-26 09:21</td>\n",
       "      <td>2</td>\n",
       "      <td>数据处理工程师</td>\n",
       "      <td>29000</td>\n",
       "      <td>35000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1549225204755070976</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>1551740730014302208</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-07-26 09:26</td>\n",
       "      <td>2</td>\n",
       "      <td>数据处理工程师</td>\n",
       "      <td>29000</td>\n",
       "      <td>35000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1549225204755070976</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>1551739682566569985</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-07-26 09:22</td>\n",
       "      <td>2</td>\n",
       "      <td>NLP算法研究员</td>\n",
       "      <td>26000</td>\n",
       "      <td>44000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1549225204755070976</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  jobId publishTime        updateTime  willNature  \\\n",
       "85  1551740730014302209         NaN  2022-07-26 09:26           2   \n",
       "86  1551739682566569984         NaN  2022-07-26 09:22           2   \n",
       "87  1551739421710221313         NaN  2022-07-26 09:21           2   \n",
       "88  1551739421710221312         NaN  2022-07-26 09:21           2   \n",
       "96  1551740730014302208         NaN  2022-07-26 09:26           2   \n",
       "97  1551739682566569985         NaN  2022-07-26 09:22           2   \n",
       "\n",
       "   positionName  minimumWage  maximumWage  payMethod   exp  \\\n",
       "85     NLP算法研究员        26000        44000          1  经验不限   \n",
       "86      数据处理工程师        29000        35000          1  经验不限   \n",
       "87     NLP算法研究员        26000        44000          1  经验不限   \n",
       "88      数据处理工程师        29000        35000          1  经验不限   \n",
       "96      数据处理工程师        29000        35000          1  经验不限   \n",
       "97     NLP算法研究员        26000        44000          1  经验不限   \n",
       "\n",
       "    educationalRequirements  count         enterpriseId  \n",
       "85                        3      3  1549225204755070976  \n",
       "86                        3      2  1549225204755070976  \n",
       "87                        3      3  1549225204755070976  \n",
       "88                        3      2  1549225204755070976  \n",
       "96                        3      2  1549225204755070976  \n",
       "97                        3      3  1549225204755070976  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows of Job whose jobId has no counterpart in JobDetail.\n",
    "# Series.isin does the membership test in one vectorised pass instead of\n",
    "# rebuilding a list from the set and scanning it for every row.\n",
    "Job[Job[\"jobId\"].isin(missing_data)]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "可以看到这些数据都来自同一家公司。当我们将 jobId 放入网站查询时，发现页面所显示的信息绝大多数都是空的，因此在这里将这些数据删除掉。"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据清洗——Job"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.去除上面所述的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>jobId</th>\n",
       "      <th>publishTime</th>\n",
       "      <th>updateTime</th>\n",
       "      <th>willNature</th>\n",
       "      <th>positionName</th>\n",
       "      <th>minimumWage</th>\n",
       "      <th>maximumWage</th>\n",
       "      <th>payMethod</th>\n",
       "      <th>exp</th>\n",
       "      <th>educationalRequirements</th>\n",
       "      <th>count</th>\n",
       "      <th>enterpriseId</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1648527394191052802</td>\n",
       "      <td>2023-04-19 11:25</td>\n",
       "      <td>2023-04-19 11:22</td>\n",
       "      <td>2</td>\n",
       "      <td>售前技术支持</td>\n",
       "      <td>12000</td>\n",
       "      <td>20000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>1648526958960709632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1648527394191052801</td>\n",
       "      <td>2023-04-19 11:26</td>\n",
       "      <td>2023-04-19 11:22</td>\n",
       "      <td>2</td>\n",
       "      <td>数据安全高级经理</td>\n",
       "      <td>16000</td>\n",
       "      <td>26000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>3</td>\n",
       "      <td>10</td>\n",
       "      <td>1648526958960709632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1648527394191052800</td>\n",
       "      <td>2023-04-19 11:27</td>\n",
       "      <td>2023-04-19 11:22</td>\n",
       "      <td>2</td>\n",
       "      <td>数据安全项目经理</td>\n",
       "      <td>15000</td>\n",
       "      <td>25000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>3</td>\n",
       "      <td>100</td>\n",
       "      <td>1648526958960709632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1648165203084447745</td>\n",
       "      <td>2023-04-18 11:35</td>\n",
       "      <td>2023-04-18 11:23</td>\n",
       "      <td>2</td>\n",
       "      <td>数据挖掘工程师</td>\n",
       "      <td>5000</td>\n",
       "      <td>8000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>1648164335723352064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1648165203084447744</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2023-04-18 11:23</td>\n",
       "      <td>2</td>\n",
       "      <td>数据管理</td>\n",
       "      <td>3500</td>\n",
       "      <td>6000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>1648164335723352064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1570</th>\n",
       "      <td>1461591923750993920</td>\n",
       "      <td>2021-12-08 11:14</td>\n",
       "      <td>2021-12-08 11:14</td>\n",
       "      <td>2</td>\n",
       "      <td>数据分析工程师</td>\n",
       "      <td>7000</td>\n",
       "      <td>10000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1461581470710759424</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1571</th>\n",
       "      <td>1461593160642854912</td>\n",
       "      <td>2021-12-08 11:13</td>\n",
       "      <td>2021-12-08 11:13</td>\n",
       "      <td>2</td>\n",
       "      <td>大数据开发工程师</td>\n",
       "      <td>7000</td>\n",
       "      <td>12000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1461581151616499712</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1572</th>\n",
       "      <td>1461595991152132096</td>\n",
       "      <td>2021-12-08 11:12</td>\n",
       "      <td>2021-12-08 11:12</td>\n",
       "      <td>2</td>\n",
       "      <td>数据分析师</td>\n",
       "      <td>6000</td>\n",
       "      <td>10000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1461580911534538752</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1573</th>\n",
       "      <td>1463067678737563648</td>\n",
       "      <td>2021-12-08 11:05</td>\n",
       "      <td>2021-12-08 11:06</td>\n",
       "      <td>2</td>\n",
       "      <td>Java开发工程师</td>\n",
       "      <td>8000</td>\n",
       "      <td>15000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1461579387123138560</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1574</th>\n",
       "      <td>1463066895702949888</td>\n",
       "      <td>2021-12-08 11:06</td>\n",
       "      <td>2021-12-08 11:06</td>\n",
       "      <td>2</td>\n",
       "      <td>开发工程师</td>\n",
       "      <td>9000</td>\n",
       "      <td>15000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1461579387123138560</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1575 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    jobId       publishTime        updateTime  willNature  \\\n",
       "0     1648527394191052802  2023-04-19 11:25  2023-04-19 11:22           2   \n",
       "1     1648527394191052801  2023-04-19 11:26  2023-04-19 11:22           2   \n",
       "2     1648527394191052800  2023-04-19 11:27  2023-04-19 11:22           2   \n",
       "3     1648165203084447745  2023-04-18 11:35  2023-04-18 11:23           2   \n",
       "4     1648165203084447744               NaN  2023-04-18 11:23           2   \n",
       "...                   ...               ...               ...         ...   \n",
       "1570  1461591923750993920  2021-12-08 11:14  2021-12-08 11:14           2   \n",
       "1571  1461593160642854912  2021-12-08 11:13  2021-12-08 11:13           2   \n",
       "1572  1461595991152132096  2021-12-08 11:12  2021-12-08 11:12           2   \n",
       "1573  1463067678737563648  2021-12-08 11:05  2021-12-08 11:06           2   \n",
       "1574  1463066895702949888  2021-12-08 11:06  2021-12-08 11:06           2   \n",
       "\n",
       "     positionName  minimumWage  maximumWage  payMethod   exp  \\\n",
       "0          售前技术支持        12000        20000          1    不限   \n",
       "1        数据安全高级经理        16000        26000          1    不限   \n",
       "2        数据安全项目经理        15000        25000          1    不限   \n",
       "3         数据挖掘工程师         5000         8000          1    不限   \n",
       "4            数据管理         3500         6000          1    不限   \n",
       "...           ...          ...          ...        ...   ...   \n",
       "1570      数据分析工程师         7000        10000          1  经验不限   \n",
       "1571     大数据开发工程师         7000        12000          1  经验不限   \n",
       "1572        数据分析师         6000        10000          1  经验不限   \n",
       "1573    Java开发工程师         8000        15000          1  经验不限   \n",
       "1574        开发工程师         9000        15000          1  经验不限   \n",
       "\n",
       "      educationalRequirements  count         enterpriseId  \n",
       "0                           3      6  1648526958960709632  \n",
       "1                           3     10  1648526958960709632  \n",
       "2                           3    100  1648526958960709632  \n",
       "3                           3      5  1648164335723352064  \n",
       "4                           3      6  1648164335723352064  \n",
       "...                       ...    ...                  ...  \n",
       "1570                        3      0  1461581470710759424  \n",
       "1571                        3      0  1461581151616499712  \n",
       "1572                        3      0  1461580911534538752  \n",
       "1573                        3      0  1461579387123138560  \n",
       "1574                        3      0  1461579387123138560  \n",
       "\n",
       "[1575 rows x 12 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop every posting whose jobId is flagged in `missing_data`, then renumber the index.\n",
    "# The flagged ids are materialised once up front instead of once per row; plain list\n",
    "# membership is kept so the comparison semantics on the large integer ids are unchanged.\n",
    "flagged_job_ids = list(missing_data)\n",
    "Job = Job[Job['jobId'].map(lambda job_id: job_id not in flagged_job_ids)].reset_index(drop=True)\n",
    "Job"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.publishTime大量缺失，且只有8条数据与updateTime不同，因此去除publishTime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True     229\n",
      "False      8\n",
      "Name: updateTime, dtype: int64\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>jobId</th>\n",
       "      <th>updateTime</th>\n",
       "      <th>willNature</th>\n",
       "      <th>positionName</th>\n",
       "      <th>minimumWage</th>\n",
       "      <th>maximumWage</th>\n",
       "      <th>payMethod</th>\n",
       "      <th>exp</th>\n",
       "      <th>educationalRequirements</th>\n",
       "      <th>count</th>\n",
       "      <th>enterpriseId</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1648527394191052802</td>\n",
       "      <td>2023-04-19 11:22</td>\n",
       "      <td>2</td>\n",
       "      <td>售前技术支持</td>\n",
       "      <td>12000</td>\n",
       "      <td>20000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>1648526958960709632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1648527394191052801</td>\n",
       "      <td>2023-04-19 11:22</td>\n",
       "      <td>2</td>\n",
       "      <td>数据安全高级经理</td>\n",
       "      <td>16000</td>\n",
       "      <td>26000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>3</td>\n",
       "      <td>10</td>\n",
       "      <td>1648526958960709632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1648527394191052800</td>\n",
       "      <td>2023-04-19 11:22</td>\n",
       "      <td>2</td>\n",
       "      <td>数据安全项目经理</td>\n",
       "      <td>15000</td>\n",
       "      <td>25000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>3</td>\n",
       "      <td>100</td>\n",
       "      <td>1648526958960709632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1648165203084447745</td>\n",
       "      <td>2023-04-18 11:23</td>\n",
       "      <td>2</td>\n",
       "      <td>数据挖掘工程师</td>\n",
       "      <td>5000</td>\n",
       "      <td>8000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>1648164335723352064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1648165203084447744</td>\n",
       "      <td>2023-04-18 11:23</td>\n",
       "      <td>2</td>\n",
       "      <td>数据管理</td>\n",
       "      <td>3500</td>\n",
       "      <td>6000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>1648164335723352064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1570</th>\n",
       "      <td>1461591923750993920</td>\n",
       "      <td>2021-12-08 11:14</td>\n",
       "      <td>2</td>\n",
       "      <td>数据分析工程师</td>\n",
       "      <td>7000</td>\n",
       "      <td>10000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1461581470710759424</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1571</th>\n",
       "      <td>1461593160642854912</td>\n",
       "      <td>2021-12-08 11:13</td>\n",
       "      <td>2</td>\n",
       "      <td>大数据开发工程师</td>\n",
       "      <td>7000</td>\n",
       "      <td>12000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1461581151616499712</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1572</th>\n",
       "      <td>1461595991152132096</td>\n",
       "      <td>2021-12-08 11:12</td>\n",
       "      <td>2</td>\n",
       "      <td>数据分析师</td>\n",
       "      <td>6000</td>\n",
       "      <td>10000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1461580911534538752</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1573</th>\n",
       "      <td>1463067678737563648</td>\n",
       "      <td>2021-12-08 11:06</td>\n",
       "      <td>2</td>\n",
       "      <td>Java开发工程师</td>\n",
       "      <td>8000</td>\n",
       "      <td>15000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1461579387123138560</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1574</th>\n",
       "      <td>1463066895702949888</td>\n",
       "      <td>2021-12-08 11:06</td>\n",
       "      <td>2</td>\n",
       "      <td>开发工程师</td>\n",
       "      <td>9000</td>\n",
       "      <td>15000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1461579387123138560</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1575 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    jobId        updateTime  willNature positionName  \\\n",
       "0     1648527394191052802  2023-04-19 11:22           2       售前技术支持   \n",
       "1     1648527394191052801  2023-04-19 11:22           2     数据安全高级经理   \n",
       "2     1648527394191052800  2023-04-19 11:22           2     数据安全项目经理   \n",
       "3     1648165203084447745  2023-04-18 11:23           2      数据挖掘工程师   \n",
       "4     1648165203084447744  2023-04-18 11:23           2         数据管理   \n",
       "...                   ...               ...         ...          ...   \n",
       "1570  1461591923750993920  2021-12-08 11:14           2      数据分析工程师   \n",
       "1571  1461593160642854912  2021-12-08 11:13           2     大数据开发工程师   \n",
       "1572  1461595991152132096  2021-12-08 11:12           2        数据分析师   \n",
       "1573  1463067678737563648  2021-12-08 11:06           2    Java开发工程师   \n",
       "1574  1463066895702949888  2021-12-08 11:06           2        开发工程师   \n",
       "\n",
       "      minimumWage  maximumWage  payMethod   exp  educationalRequirements  \\\n",
       "0           12000        20000          1    不限                        3   \n",
       "1           16000        26000          1    不限                        3   \n",
       "2           15000        25000          1    不限                        3   \n",
       "3            5000         8000          1    不限                        3   \n",
       "4            3500         6000          1    不限                        3   \n",
       "...           ...          ...        ...   ...                      ...   \n",
       "1570         7000        10000          1  经验不限                        3   \n",
       "1571         7000        12000          1  经验不限                        3   \n",
       "1572         6000        10000          1  经验不限                        3   \n",
       "1573         8000        15000          1  经验不限                        3   \n",
       "1574         9000        15000          1  经验不限                        3   \n",
       "\n",
       "      count         enterpriseId  \n",
       "0         6  1648526958960709632  \n",
       "1        10  1648526958960709632  \n",
       "2       100  1648526958960709632  \n",
       "3         5  1648164335723352064  \n",
       "4         6  1648164335723352064  \n",
       "...     ...                  ...  \n",
       "1570      0  1461581470710759424  \n",
       "1571      0  1461581151616499712  \n",
       "1572      0  1461580911534538752  \n",
       "1573      0  1461579387123138560  \n",
       "1574      0  1461579387123138560  \n",
       "\n",
       "[1575 rows x 11 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# On the rows that contain no missing values at all, count how often publishTime\n",
    "# equals updateTime (True) and how often the two differ (False).\n",
    "complete_rows = Job.dropna()\n",
    "same_timestamp = complete_rows['publishTime'].values == complete_rows['updateTime']\n",
    "print(same_timestamp.value_counts())\n",
    "\n",
    "# Keep every column except publishTime.\n",
    "Job = Job.drop(columns=\"publishTime\")\n",
    "Job"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.educationalRequirements进行标签映射\n",
    "\n",
    "```python\n",
    "educationalRequirementsDic = {\n",
    "    \"博士\":5,\"硕士\":4,\"本科\":3,\"大专\":2,\"技工\":1,\"不限\":0,\n",
    "}\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>jobId</th>\n",
       "      <th>updateTime</th>\n",
       "      <th>willNature</th>\n",
       "      <th>positionName</th>\n",
       "      <th>minimumWage</th>\n",
       "      <th>maximumWage</th>\n",
       "      <th>payMethod</th>\n",
       "      <th>exp</th>\n",
       "      <th>educationalRequirements</th>\n",
       "      <th>count</th>\n",
       "      <th>enterpriseId</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1648527394191052802</td>\n",
       "      <td>2023-04-19 11:22</td>\n",
       "      <td>2</td>\n",
       "      <td>售前技术支持</td>\n",
       "      <td>12000</td>\n",
       "      <td>20000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>本科</td>\n",
       "      <td>6</td>\n",
       "      <td>1648526958960709632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1648527394191052801</td>\n",
       "      <td>2023-04-19 11:22</td>\n",
       "      <td>2</td>\n",
       "      <td>数据安全高级经理</td>\n",
       "      <td>16000</td>\n",
       "      <td>26000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>本科</td>\n",
       "      <td>10</td>\n",
       "      <td>1648526958960709632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1648527394191052800</td>\n",
       "      <td>2023-04-19 11:22</td>\n",
       "      <td>2</td>\n",
       "      <td>数据安全项目经理</td>\n",
       "      <td>15000</td>\n",
       "      <td>25000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>本科</td>\n",
       "      <td>100</td>\n",
       "      <td>1648526958960709632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1648165203084447745</td>\n",
       "      <td>2023-04-18 11:23</td>\n",
       "      <td>2</td>\n",
       "      <td>数据挖掘工程师</td>\n",
       "      <td>5000</td>\n",
       "      <td>8000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>本科</td>\n",
       "      <td>5</td>\n",
       "      <td>1648164335723352064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1648165203084447744</td>\n",
       "      <td>2023-04-18 11:23</td>\n",
       "      <td>2</td>\n",
       "      <td>数据管理</td>\n",
       "      <td>3500</td>\n",
       "      <td>6000</td>\n",
       "      <td>1</td>\n",
       "      <td>不限</td>\n",
       "      <td>本科</td>\n",
       "      <td>6</td>\n",
       "      <td>1648164335723352064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1570</th>\n",
       "      <td>1461591923750993920</td>\n",
       "      <td>2021-12-08 11:14</td>\n",
       "      <td>2</td>\n",
       "      <td>数据分析工程师</td>\n",
       "      <td>7000</td>\n",
       "      <td>10000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>本科</td>\n",
       "      <td>0</td>\n",
       "      <td>1461581470710759424</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1571</th>\n",
       "      <td>1461593160642854912</td>\n",
       "      <td>2021-12-08 11:13</td>\n",
       "      <td>2</td>\n",
       "      <td>大数据开发工程师</td>\n",
       "      <td>7000</td>\n",
       "      <td>12000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>本科</td>\n",
       "      <td>0</td>\n",
       "      <td>1461581151616499712</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1572</th>\n",
       "      <td>1461595991152132096</td>\n",
       "      <td>2021-12-08 11:12</td>\n",
       "      <td>2</td>\n",
       "      <td>数据分析师</td>\n",
       "      <td>6000</td>\n",
       "      <td>10000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>本科</td>\n",
       "      <td>0</td>\n",
       "      <td>1461580911534538752</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1573</th>\n",
       "      <td>1463067678737563648</td>\n",
       "      <td>2021-12-08 11:06</td>\n",
       "      <td>2</td>\n",
       "      <td>Java开发工程师</td>\n",
       "      <td>8000</td>\n",
       "      <td>15000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>本科</td>\n",
       "      <td>0</td>\n",
       "      <td>1461579387123138560</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1574</th>\n",
       "      <td>1463066895702949888</td>\n",
       "      <td>2021-12-08 11:06</td>\n",
       "      <td>2</td>\n",
       "      <td>开发工程师</td>\n",
       "      <td>9000</td>\n",
       "      <td>15000</td>\n",
       "      <td>1</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>本科</td>\n",
       "      <td>0</td>\n",
       "      <td>1461579387123138560</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1575 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    jobId        updateTime  willNature positionName  \\\n",
       "0     1648527394191052802  2023-04-19 11:22           2       售前技术支持   \n",
       "1     1648527394191052801  2023-04-19 11:22           2     数据安全高级经理   \n",
       "2     1648527394191052800  2023-04-19 11:22           2     数据安全项目经理   \n",
       "3     1648165203084447745  2023-04-18 11:23           2      数据挖掘工程师   \n",
       "4     1648165203084447744  2023-04-18 11:23           2         数据管理   \n",
       "...                   ...               ...         ...          ...   \n",
       "1570  1461591923750993920  2021-12-08 11:14           2      数据分析工程师   \n",
       "1571  1461593160642854912  2021-12-08 11:13           2     大数据开发工程师   \n",
       "1572  1461595991152132096  2021-12-08 11:12           2        数据分析师   \n",
       "1573  1463067678737563648  2021-12-08 11:06           2    Java开发工程师   \n",
       "1574  1463066895702949888  2021-12-08 11:06           2        开发工程师   \n",
       "\n",
       "      minimumWage  maximumWage  payMethod   exp educationalRequirements  \\\n",
       "0           12000        20000          1    不限                      本科   \n",
       "1           16000        26000          1    不限                      本科   \n",
       "2           15000        25000          1    不限                      本科   \n",
       "3            5000         8000          1    不限                      本科   \n",
       "4            3500         6000          1    不限                      本科   \n",
       "...           ...          ...        ...   ...                     ...   \n",
       "1570         7000        10000          1  经验不限                      本科   \n",
       "1571         7000        12000          1  经验不限                      本科   \n",
       "1572         6000        10000          1  经验不限                      本科   \n",
       "1573         8000        15000          1  经验不限                      本科   \n",
       "1574         9000        15000          1  经验不限                      本科   \n",
       "\n",
       "      count         enterpriseId  \n",
       "0         6  1648526958960709632  \n",
       "1        10  1648526958960709632  \n",
       "2       100  1648526958960709632  \n",
       "3         5  1648164335723352064  \n",
       "4         6  1648164335723352064  \n",
       "...     ...                  ...  \n",
       "1570      0  1461581470710759424  \n",
       "1571      0  1461581151616499712  \n",
       "1572      0  1461580911534538752  \n",
       "1573      0  1461579387123138560  \n",
       "1574      0  1461579387123138560  \n",
       "\n",
       "[1575 rows x 11 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Label -> numeric code for each education requirement level.\n",
    "educationalRequirementsDic = {\n",
    "    \"博士\": 5,\n",
    "    \"硕士\": 4,\n",
    "    \"本科\": 3,\n",
    "    \"大专\": 2,\n",
    "    \"技工\": 1,\n",
    "    \"不限\": 0,\n",
    "}\n",
    "\n",
    "# The column holds the numeric codes; translate them into their labels.\n",
    "# NOTE(review): utils.ReverseDic presumably returns the dict with keys and values swapped -- confirm in Utils/utils.py.\n",
    "education_code_to_label = utils.ReverseDic(educationalRequirementsDic)\n",
    "Job['educationalRequirements'] = Job['educationalRequirements'].map(education_code_to_label)\n",
    "Job"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.对willNature进行标签映射"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label -> numeric code for the job nature (全职 = full-time, 实习 = internship).\n",
    "willNatureDic = {\n",
    "    \"全职\": 2,\n",
    "    \"实习\": 0,\n",
    "}\n",
    "\n",
    "# Translate the codes stored in the column into their labels.\n",
    "# NOTE(review): utils.ReverseDic presumably returns the dict with keys and values swapped -- confirm in Utils/utils.py.\n",
    "will_nature_code_to_label = utils.ReverseDic(willNatureDic)\n",
    "Job['willNature'] = Job['willNature'].map(will_nature_code_to_label)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.exp字段处理\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3        465\n",
       "1        348\n",
       "经验不限     258\n",
       "5        168\n",
       "0        159\n",
       "1-3年      55\n",
       "不限        54\n",
       "3-5年      52\n",
       "5-7年       6\n",
       "10         6\n",
       "7年以上       3\n",
       "5-10年      1\n",
       "Name: exp, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the distinct raw values of the exp column and how often each one occurs.\n",
    "Job['exp'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       471\n",
       "3       465\n",
       "1       348\n",
       "5       168\n",
       "1-3      55\n",
       "3-5      52\n",
       "5-7       6\n",
       "10        6\n",
       "7以上       3\n",
       "5-10      1\n",
       "Name: exp, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 1. Normalise the \"no experience required\" labels (经验不限 / 不限) to the string \"0\".\n",
    "# 2. Strip the 年 (\"years\") suffix, so that e.g. \"1-3年\" becomes \"1-3\".\n",
    "# Both steps use pandas' vectorised replacements; values stay strings.\n",
    "Job['exp'] = (\n",
    "    Job['exp']\n",
    "    .replace([\"经验不限\", \"不限\"], \"0\")\n",
    "    .str.replace(\"年\", \"\", regex=False)\n",
    ")\n",
    "Job['exp'].value_counts()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "企业要求一般为最低年限，因此，在这里对于某些区间范围我们将取最小值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3     517\n",
       "0     471\n",
       "1     403\n",
       "5     175\n",
       "10      6\n",
       "7       3\n",
       "Name: exp, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 标签映射\n",
    "ExpDic = {\"1-3\":\"1\",\"3-5\":\"3\",\"5-7\":\"5\",\"7以上\":\"7\",\"5-10\":\"5\",\"3\":\"3\",\"0\":\"0\",\"1\":\"1\",\"5\":\"5\",\"10\":\"10\"}\n",
    "\n",
    "Job['exp'] = Job['exp'].map(ExpDic)\n",
    "Job['exp'].value_counts()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.对paymethod进行处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    1569\n",
       "2       5\n",
       "0       1\n",
       "Name: payMethod, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Job['payMethod'].value_counts()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "payMethod==0代表年薪，2代表日薪，统一成月薪"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>jobId</th>\n",
       "      <th>updateTime</th>\n",
       "      <th>willNature</th>\n",
       "      <th>positionName</th>\n",
       "      <th>minimumWage</th>\n",
       "      <th>maximumWage</th>\n",
       "      <th>exp</th>\n",
       "      <th>educationalRequirements</th>\n",
       "      <th>count</th>\n",
       "      <th>enterpriseId</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1648527394191052802</td>\n",
       "      <td>2023-04-19 11:22</td>\n",
       "      <td>全职</td>\n",
       "      <td>售前技术支持</td>\n",
       "      <td>12000.0</td>\n",
       "      <td>20000.0</td>\n",
       "      <td>0</td>\n",
       "      <td>本科</td>\n",
       "      <td>6</td>\n",
       "      <td>1648526958960709632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1648527394191052801</td>\n",
       "      <td>2023-04-19 11:22</td>\n",
       "      <td>全职</td>\n",
       "      <td>数据安全高级经理</td>\n",
       "      <td>16000.0</td>\n",
       "      <td>26000.0</td>\n",
       "      <td>0</td>\n",
       "      <td>本科</td>\n",
       "      <td>10</td>\n",
       "      <td>1648526958960709632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1648527394191052800</td>\n",
       "      <td>2023-04-19 11:22</td>\n",
       "      <td>全职</td>\n",
       "      <td>数据安全项目经理</td>\n",
       "      <td>15000.0</td>\n",
       "      <td>25000.0</td>\n",
       "      <td>0</td>\n",
       "      <td>本科</td>\n",
       "      <td>100</td>\n",
       "      <td>1648526958960709632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1648165203084447745</td>\n",
       "      <td>2023-04-18 11:23</td>\n",
       "      <td>全职</td>\n",
       "      <td>数据挖掘工程师</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>8000.0</td>\n",
       "      <td>0</td>\n",
       "      <td>本科</td>\n",
       "      <td>5</td>\n",
       "      <td>1648164335723352064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1648165203084447744</td>\n",
       "      <td>2023-04-18 11:23</td>\n",
       "      <td>全职</td>\n",
       "      <td>数据管理</td>\n",
       "      <td>3500.0</td>\n",
       "      <td>6000.0</td>\n",
       "      <td>0</td>\n",
       "      <td>本科</td>\n",
       "      <td>6</td>\n",
       "      <td>1648164335723352064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1570</th>\n",
       "      <td>1461591923750993920</td>\n",
       "      <td>2021-12-08 11:14</td>\n",
       "      <td>全职</td>\n",
       "      <td>数据分析工程师</td>\n",
       "      <td>7000.0</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>0</td>\n",
       "      <td>本科</td>\n",
       "      <td>0</td>\n",
       "      <td>1461581470710759424</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1571</th>\n",
       "      <td>1461593160642854912</td>\n",
       "      <td>2021-12-08 11:13</td>\n",
       "      <td>全职</td>\n",
       "      <td>大数据开发工程师</td>\n",
       "      <td>7000.0</td>\n",
       "      <td>12000.0</td>\n",
       "      <td>0</td>\n",
       "      <td>本科</td>\n",
       "      <td>0</td>\n",
       "      <td>1461581151616499712</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1572</th>\n",
       "      <td>1461595991152132096</td>\n",
       "      <td>2021-12-08 11:12</td>\n",
       "      <td>全职</td>\n",
       "      <td>数据分析师</td>\n",
       "      <td>6000.0</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>0</td>\n",
       "      <td>本科</td>\n",
       "      <td>0</td>\n",
       "      <td>1461580911534538752</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1573</th>\n",
       "      <td>1463067678737563648</td>\n",
       "      <td>2021-12-08 11:06</td>\n",
       "      <td>全职</td>\n",
       "      <td>Java开发工程师</td>\n",
       "      <td>8000.0</td>\n",
       "      <td>15000.0</td>\n",
       "      <td>0</td>\n",
       "      <td>本科</td>\n",
       "      <td>0</td>\n",
       "      <td>1461579387123138560</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1574</th>\n",
       "      <td>1463066895702949888</td>\n",
       "      <td>2021-12-08 11:06</td>\n",
       "      <td>全职</td>\n",
       "      <td>开发工程师</td>\n",
       "      <td>9000.0</td>\n",
       "      <td>15000.0</td>\n",
       "      <td>0</td>\n",
       "      <td>本科</td>\n",
       "      <td>0</td>\n",
       "      <td>1461579387123138560</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1575 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    jobId        updateTime willNature positionName  \\\n",
       "0     1648527394191052802  2023-04-19 11:22         全职       售前技术支持   \n",
       "1     1648527394191052801  2023-04-19 11:22         全职     数据安全高级经理   \n",
       "2     1648527394191052800  2023-04-19 11:22         全职     数据安全项目经理   \n",
       "3     1648165203084447745  2023-04-18 11:23         全职      数据挖掘工程师   \n",
       "4     1648165203084447744  2023-04-18 11:23         全职         数据管理   \n",
       "...                   ...               ...        ...          ...   \n",
       "1570  1461591923750993920  2021-12-08 11:14         全职      数据分析工程师   \n",
       "1571  1461593160642854912  2021-12-08 11:13         全职     大数据开发工程师   \n",
       "1572  1461595991152132096  2021-12-08 11:12         全职        数据分析师   \n",
       "1573  1463067678737563648  2021-12-08 11:06         全职    Java开发工程师   \n",
       "1574  1463066895702949888  2021-12-08 11:06         全职        开发工程师   \n",
       "\n",
       "      minimumWage  maximumWage exp educationalRequirements  count  \\\n",
       "0         12000.0      20000.0   0                      本科      6   \n",
       "1         16000.0      26000.0   0                      本科     10   \n",
       "2         15000.0      25000.0   0                      本科    100   \n",
       "3          5000.0       8000.0   0                      本科      5   \n",
       "4          3500.0       6000.0   0                      本科      6   \n",
       "...           ...          ...  ..                     ...    ...   \n",
       "1570       7000.0      10000.0   0                      本科      0   \n",
       "1571       7000.0      12000.0   0                      本科      0   \n",
       "1572       6000.0      10000.0   0                      本科      0   \n",
       "1573       8000.0      15000.0   0                      本科      0   \n",
       "1574       9000.0      15000.0   0                      本科      0   \n",
       "\n",
       "             enterpriseId  \n",
       "0     1648526958960709632  \n",
       "1     1648526958960709632  \n",
       "2     1648526958960709632  \n",
       "3     1648164335723352064  \n",
       "4     1648164335723352064  \n",
       "...                   ...  \n",
       "1570  1461581470710759424  \n",
       "1571  1461581151616499712  \n",
       "1572  1461580911534538752  \n",
       "1573  1461579387123138560  \n",
       "1574  1461579387123138560  \n",
       "\n",
       "[1575 rows x 10 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# payMethod codes: 0 = annual salary, 2 = daily wage; other rows are left as they are.\n",
    "is_annual = Job['payMethod'] == 0\n",
    "is_daily = Job['payMethod'] == 2\n",
    "\n",
    "# Put both wage bounds on a monthly basis: annual / 12, daily * 30.\n",
    "for wage_col in ['minimumWage', 'maximumWage']:\n",
    "    Job[wage_col] = (\n",
    "        Job[wage_col]\n",
    "        .mask(is_annual, Job[wage_col] / 12)\n",
    "        .mask(is_daily, Job[wage_col] * 30)\n",
    "    )\n",
    "\n",
    "# payMethod has served its purpose once every wage is monthly.\n",
    "Job = Job.drop(columns='payMethod')\n",
    "Job"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.count处理\n",
    "\n",
    "count代表的是工作的数量，0代表不限人数"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 7.新增平均薪资字段"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# averageSalary is the midpoint of the advertised wage range. It is computed\n",
    "# column-wise instead of rebuilding every row through DataFrame.apply(axis=1).\n",
    "Job = Job.assign(averageSalary=(Job['minimumWage'] + Job['maximumWage']) / 2)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据清洗——JobDetail"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>jobId</th>\n",
       "      <th>jobRequiredments</th>\n",
       "      <th>welfare</th>\n",
       "      <th>workplace</th>\n",
       "      <th>deadline</th>\n",
       "      <th>function</th>\n",
       "      <th>publisher</th>\n",
       "      <th>status</th>\n",
       "      <th>publisherName</th>\n",
       "      <th>enterpriseName</th>\n",
       "      <th>messageTemplateId</th>\n",
       "      <th>keywordList</th>\n",
       "      <th>skillsList</th>\n",
       "      <th>resumeCount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1648527394191052801</td>\n",
       "      <td>主要职责：\\n1、辅助集团数据安全规划工作，梳理摸底调研思路并定位现状和需求，设计可落地解决...</td>\n",
       "      <td>[\"餐饮补贴\",\"专业培训\",\"弹性工作\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2023-12-31 00:00</td>\n",
       "      <td>[\"大数据分析师\"]</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>超级管理员</td>\n",
       "      <td>深圳市众云网有限公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>['互联网', '软件']</td>\n",
       "      <td>['数据安全', '数据可视化', '数据挖掘']</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1648527394191052800</td>\n",
       "      <td>主要职责：\\n1、负责集团内数据安全项目的推广工作，按照部门数据安全建设规划，与集团内客户开...</td>\n",
       "      <td>[\"餐饮补贴\",\"专业培训\",\"弹性工作\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2023-12-31 00:00</td>\n",
       "      <td>[\"大数据分析师\"]</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>超级管理员</td>\n",
       "      <td>深圳市众云网有限公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>['互联网', '软件']</td>\n",
       "      <td>['数据安全', '数据可视化', '数据挖掘']</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1648527394191052802</td>\n",
       "      <td>岗位职责：\\n1、负责配合客户及项目需要，完成技术交流，提供解决方案；\\n2、负责网络安全、...</td>\n",
       "      <td>[\"餐饮补贴\",\"专业培训\",\"弹性工作\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2023-12-31 00:00</td>\n",
       "      <td>[\"大数据分析师\"]</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>超级管理员</td>\n",
       "      <td>深圳市众云网有限公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>['互联网', '软件']</td>\n",
       "      <td>['网络安全', '云计算']</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1631112859985510400</td>\n",
       "      <td>1. 深入了解区域项目，搭建项目概算模板； \\n\\n2. 负责中小项目的项目概算工作； \\n...</td>\n",
       "      <td>[\"餐饮补贴\",\"专业培训\",\"弹性工作\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-01-01 00:00</td>\n",
       "      <td>[\"财务实习生\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>admin</td>\n",
       "      <td>深圳市海柔创新科技有限公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>['互联网', '人工智能']</td>\n",
       "      <td>[]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1648165203084447745</td>\n",
       "      <td>1、全日制本科及以上学历，计算机、数学、统计等相关 专业：超过三年的相关工作经验：\\n2、熟...</td>\n",
       "      <td>[\"餐饮补贴\",\"专业培训\",\"弹性工作\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2023-12-31 00:00</td>\n",
       "      <td>[\"大数据分析师\"]</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>超级管理员</td>\n",
       "      <td>济宁森羽网络科技有限公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>['互联网', '大数据']</td>\n",
       "      <td>['数据可视化', '机器学习', '数据挖掘']</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 jobId                                   jobRequiredments  \\\n",
       "0  1648527394191052801  主要职责：\\n1、辅助集团数据安全规划工作，梳理摸底调研思路并定位现状和需求，设计可落地解决...   \n",
       "1  1648527394191052800  主要职责：\\n1、负责集团内数据安全项目的推广工作，按照部门数据安全建设规划，与集团内客户开...   \n",
       "2  1648527394191052802  岗位职责：\\n1、负责配合客户及项目需要，完成技术交流，提供解决方案；\\n2、负责网络安全、...   \n",
       "3  1631112859985510400  1. 深入了解区域项目，搭建项目概算模板； \\n\\n2. 负责中小项目的项目概算工作； \\n...   \n",
       "4  1648165203084447745  1、全日制本科及以上学历，计算机、数学、统计等相关 专业：超过三年的相关工作经验：\\n2、熟...   \n",
       "\n",
       "                  welfare workplace          deadline    function  publisher  \\\n",
       "0  [\"餐饮补贴\",\"专业培训\",\"弹性工作\"]       NaN  2023-12-31 00:00  [\"大数据分析师\"]        1.0   \n",
       "1  [\"餐饮补贴\",\"专业培训\",\"弹性工作\"]       NaN  2023-12-31 00:00  [\"大数据分析师\"]        1.0   \n",
       "2  [\"餐饮补贴\",\"专业培训\",\"弹性工作\"]       NaN  2023-12-31 00:00  [\"大数据分析师\"]        1.0   \n",
       "3  [\"餐饮补贴\",\"专业培训\",\"弹性工作\"]       NaN  2024-01-01 00:00   [\"财务实习生\"]        NaN   \n",
       "4  [\"餐饮补贴\",\"专业培训\",\"弹性工作\"]       NaN  2023-12-31 00:00  [\"大数据分析师\"]        1.0   \n",
       "\n",
       "   status publisherName enterpriseName  messageTemplateId      keywordList  \\\n",
       "0       1         超级管理员     深圳市众云网有限公司                NaN    ['互联网', '软件']   \n",
       "1       1         超级管理员     深圳市众云网有限公司                NaN    ['互联网', '软件']   \n",
       "2       1         超级管理员     深圳市众云网有限公司                NaN    ['互联网', '软件']   \n",
       "3       1         admin  深圳市海柔创新科技有限公司                NaN  ['互联网', '人工智能']   \n",
       "4       1         超级管理员   济宁森羽网络科技有限公司                NaN   ['互联网', '大数据']   \n",
       "\n",
       "                  skillsList  resumeCount  \n",
       "0  ['数据安全', '数据可视化', '数据挖掘']            0  \n",
       "1  ['数据安全', '数据可视化', '数据挖掘']            0  \n",
       "2            ['网络安全', '云计算']            0  \n",
       "3                         []            2  \n",
       "4  ['数据可视化', '机器学习', '数据挖掘']            0  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of JobDetail before any cleaning.\n",
    "JobDetail.head()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.status处理，publisherName处理，publisher，messageTemplateId\n",
    "\n",
    "status全是1，直接删除\n",
    "\n",
    "publisherName为发布者，直接删除\n",
    "\n",
    "publisher 也是发布者相关信息，直接删除\n",
    "\n",
    "messageTemplateId全是NaN，直接删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>jobId</th>\n",
       "      <th>jobRequiredments</th>\n",
       "      <th>welfare</th>\n",
       "      <th>workplace</th>\n",
       "      <th>deadline</th>\n",
       "      <th>function</th>\n",
       "      <th>enterpriseName</th>\n",
       "      <th>keywordList</th>\n",
       "      <th>skillsList</th>\n",
       "      <th>resumeCount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1648527394191052801</td>\n",
       "      <td>主要职责：\\n1、辅助集团数据安全规划工作，梳理摸底调研思路并定位现状和需求，设计可落地解决...</td>\n",
       "      <td>[\"餐饮补贴\",\"专业培训\",\"弹性工作\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2023-12-31 00:00</td>\n",
       "      <td>[\"大数据分析师\"]</td>\n",
       "      <td>深圳市众云网有限公司</td>\n",
       "      <td>['互联网', '软件']</td>\n",
       "      <td>['数据安全', '数据可视化', '数据挖掘']</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1648527394191052800</td>\n",
       "      <td>主要职责：\\n1、负责集团内数据安全项目的推广工作，按照部门数据安全建设规划，与集团内客户开...</td>\n",
       "      <td>[\"餐饮补贴\",\"专业培训\",\"弹性工作\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2023-12-31 00:00</td>\n",
       "      <td>[\"大数据分析师\"]</td>\n",
       "      <td>深圳市众云网有限公司</td>\n",
       "      <td>['互联网', '软件']</td>\n",
       "      <td>['数据安全', '数据可视化', '数据挖掘']</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1648527394191052802</td>\n",
       "      <td>岗位职责：\\n1、负责配合客户及项目需要，完成技术交流，提供解决方案；\\n2、负责网络安全、...</td>\n",
       "      <td>[\"餐饮补贴\",\"专业培训\",\"弹性工作\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2023-12-31 00:00</td>\n",
       "      <td>[\"大数据分析师\"]</td>\n",
       "      <td>深圳市众云网有限公司</td>\n",
       "      <td>['互联网', '软件']</td>\n",
       "      <td>['网络安全', '云计算']</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1631112859985510400</td>\n",
       "      <td>1. 深入了解区域项目，搭建项目概算模板； \\n\\n2. 负责中小项目的项目概算工作； \\n...</td>\n",
       "      <td>[\"餐饮补贴\",\"专业培训\",\"弹性工作\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-01-01 00:00</td>\n",
       "      <td>[\"财务实习生\"]</td>\n",
       "      <td>深圳市海柔创新科技有限公司</td>\n",
       "      <td>['互联网', '人工智能']</td>\n",
       "      <td>[]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1648165203084447745</td>\n",
       "      <td>1、全日制本科及以上学历，计算机、数学、统计等相关 专业：超过三年的相关工作经验：\\n2、熟...</td>\n",
       "      <td>[\"餐饮补贴\",\"专业培训\",\"弹性工作\"]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2023-12-31 00:00</td>\n",
       "      <td>[\"大数据分析师\"]</td>\n",
       "      <td>济宁森羽网络科技有限公司</td>\n",
       "      <td>['互联网', '大数据']</td>\n",
       "      <td>['数据可视化', '机器学习', '数据挖掘']</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1570</th>\n",
       "      <td>1461590578927108096</td>\n",
       "      <td>岗位职责：\\n1、根据指定业务场景进行数据分析，撰写分析报告。 \\n2、基于历史数据与实操经...</td>\n",
       "      <td>[\"五险一金\",\"出国机会\",\"免费班车\",\"绩效奖金\",\"员工旅游\",\"年终奖金\",\"交通...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2021-12-08 11:13</td>\n",
       "      <td>[\"数据分析师\"]</td>\n",
       "      <td>上海众言网络科技有限公司</td>\n",
       "      <td>['数据分析', '大数据分析', '人工智能', 'SPSS', '数学建模']</td>\n",
       "      <td>['数据探索', '数据通信', '数据分析', '数据查询', '数据建模']</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1571</th>\n",
       "      <td>1463066895702949888</td>\n",
       "      <td>岗位职责：软件开发\\n岗位要求;1、 熟练掌握Spring MVC、Spring boot架...</td>\n",
       "      <td>[\"五险一金\",\"出国机会\",\"绩效奖金\",\"年终奖金\",\"员工旅游\",\"交通补贴\",\"股票...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2021-12-08 11:05</td>\n",
       "      <td>[\"其他\"]</td>\n",
       "      <td>中软国际</td>\n",
       "      <td>['软件开发', '系统开发', '数据挖掘']</td>\n",
       "      <td>['数据查询', '数据分析', '数据建模', '数据探索', '接口开发']</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1572</th>\n",
       "      <td>1463067678737563648</td>\n",
       "      <td>岗位职责：\\n1.搜集整理汇总软件开发新兴技术报告；\\n2.研究并整理教育政策以及高等院校校...</td>\n",
       "      <td>[\"五险一金\",\"出国机会\",\"绩效奖金\",\"员工旅游\",\"年终奖金\",\"交通补贴\",\"弹性...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2021-12-08 11:05</td>\n",
       "      <td>[\"其他\",\"算法工程师\"]</td>\n",
       "      <td>中软国际</td>\n",
       "      <td>['Java开发', '系统开发', '代码编写', 'Linux命令', '云计算平台']</td>\n",
       "      <td>['数据分析', '数据采集', '数据探索', '系统运维', '数据挖掘']</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1573</th>\n",
       "      <td>1461591923750993920</td>\n",
       "      <td>岗位职责：\\n1、根据论文、文献标准化各类教学模型的输入输出，产出数学模型分析模板原型，形成...</td>\n",
       "      <td>[\"五险一金\",\"免费班车\",\"员工旅游\",\"交通补贴\",\"餐饮补贴\",\"通讯补贴\",\"专业...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2021-12-08 11:13</td>\n",
       "      <td>[\"数据分析师\"]</td>\n",
       "      <td>上海众言网络科技有限公司</td>\n",
       "      <td>['数据分析', '大数据分析', '人工智能', '产品发布', '产品设计']</td>\n",
       "      <td>['数据探索', '数据采集', '数据查询', '数据分析', '数据计算']</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1574</th>\n",
       "      <td>1461593160642854912</td>\n",
       "      <td>岗位职责：\\n1.本科及以上学历，计算机类相关专业，有相关程序开发经验； \\n2、熟悉一门高...</td>\n",
       "      <td>[\"五险一金\",\"出国机会\",\"免费班车\",\"绩效奖金\",\"员工旅游\",\"年终奖金\",\"交通...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2021-12-08 11:12</td>\n",
       "      <td>[\"算法工程师\"]</td>\n",
       "      <td>深圳市明源云客电子商务有限公司</td>\n",
       "      <td>['数据分析', '数据开发', 'Scala', 'Python']</td>\n",
       "      <td>['数据探索', '数据查询', '数据建模', '数据计算', '数据预处理']</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1575 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    jobId                                   jobRequiredments  \\\n",
       "0     1648527394191052801  主要职责：\\n1、辅助集团数据安全规划工作，梳理摸底调研思路并定位现状和需求，设计可落地解决...   \n",
       "1     1648527394191052800  主要职责：\\n1、负责集团内数据安全项目的推广工作，按照部门数据安全建设规划，与集团内客户开...   \n",
       "2     1648527394191052802  岗位职责：\\n1、负责配合客户及项目需要，完成技术交流，提供解决方案；\\n2、负责网络安全、...   \n",
       "3     1631112859985510400  1. 深入了解区域项目，搭建项目概算模板； \\n\\n2. 负责中小项目的项目概算工作； \\n...   \n",
       "4     1648165203084447745  1、全日制本科及以上学历，计算机、数学、统计等相关 专业：超过三年的相关工作经验：\\n2、熟...   \n",
       "...                   ...                                                ...   \n",
       "1570  1461590578927108096  岗位职责：\\n1、根据指定业务场景进行数据分析，撰写分析报告。 \\n2、基于历史数据与实操经...   \n",
       "1571  1463066895702949888  岗位职责：软件开发\\n岗位要求;1、 熟练掌握Spring MVC、Spring boot架...   \n",
       "1572  1463067678737563648  岗位职责：\\n1.搜集整理汇总软件开发新兴技术报告；\\n2.研究并整理教育政策以及高等院校校...   \n",
       "1573  1461591923750993920  岗位职责：\\n1、根据论文、文献标准化各类教学模型的输入输出，产出数学模型分析模板原型，形成...   \n",
       "1574  1461593160642854912  岗位职责：\\n1.本科及以上学历，计算机类相关专业，有相关程序开发经验； \\n2、熟悉一门高...   \n",
       "\n",
       "                                                welfare workplace  \\\n",
       "0                                [\"餐饮补贴\",\"专业培训\",\"弹性工作\"]       NaN   \n",
       "1                                [\"餐饮补贴\",\"专业培训\",\"弹性工作\"]       NaN   \n",
       "2                                [\"餐饮补贴\",\"专业培训\",\"弹性工作\"]       NaN   \n",
       "3                                [\"餐饮补贴\",\"专业培训\",\"弹性工作\"]       NaN   \n",
       "4                                [\"餐饮补贴\",\"专业培训\",\"弹性工作\"]       NaN   \n",
       "...                                                 ...       ...   \n",
       "1570  [\"五险一金\",\"出国机会\",\"免费班车\",\"绩效奖金\",\"员工旅游\",\"年终奖金\",\"交通...       NaN   \n",
       "1571  [\"五险一金\",\"出国机会\",\"绩效奖金\",\"年终奖金\",\"员工旅游\",\"交通补贴\",\"股票...       NaN   \n",
       "1572  [\"五险一金\",\"出国机会\",\"绩效奖金\",\"员工旅游\",\"年终奖金\",\"交通补贴\",\"弹性...       NaN   \n",
       "1573  [\"五险一金\",\"免费班车\",\"员工旅游\",\"交通补贴\",\"餐饮补贴\",\"通讯补贴\",\"专业...       NaN   \n",
       "1574  [\"五险一金\",\"出国机会\",\"免费班车\",\"绩效奖金\",\"员工旅游\",\"年终奖金\",\"交通...       NaN   \n",
       "\n",
       "              deadline        function   enterpriseName  \\\n",
       "0     2023-12-31 00:00      [\"大数据分析师\"]       深圳市众云网有限公司   \n",
       "1     2023-12-31 00:00      [\"大数据分析师\"]       深圳市众云网有限公司   \n",
       "2     2023-12-31 00:00      [\"大数据分析师\"]       深圳市众云网有限公司   \n",
       "3     2024-01-01 00:00       [\"财务实习生\"]    深圳市海柔创新科技有限公司   \n",
       "4     2023-12-31 00:00      [\"大数据分析师\"]     济宁森羽网络科技有限公司   \n",
       "...                ...             ...              ...   \n",
       "1570  2021-12-08 11:13       [\"数据分析师\"]     上海众言网络科技有限公司   \n",
       "1571  2021-12-08 11:05          [\"其他\"]             中软国际   \n",
       "1572  2021-12-08 11:05  [\"其他\",\"算法工程师\"]             中软国际   \n",
       "1573  2021-12-08 11:13       [\"数据分析师\"]     上海众言网络科技有限公司   \n",
       "1574  2021-12-08 11:12       [\"算法工程师\"]  深圳市明源云客电子商务有限公司   \n",
       "\n",
       "                                         keywordList  \\\n",
       "0                                      ['互联网', '软件']   \n",
       "1                                      ['互联网', '软件']   \n",
       "2                                      ['互联网', '软件']   \n",
       "3                                    ['互联网', '人工智能']   \n",
       "4                                     ['互联网', '大数据']   \n",
       "...                                              ...   \n",
       "1570       ['数据分析', '大数据分析', '人工智能', 'SPSS', '数学建模']   \n",
       "1571                        ['软件开发', '系统开发', '数据挖掘']   \n",
       "1572  ['Java开发', '系统开发', '代码编写', 'Linux命令', '云计算平台']   \n",
       "1573       ['数据分析', '大数据分析', '人工智能', '产品发布', '产品设计']   \n",
       "1574             ['数据分析', '数据开发', 'Scala', 'Python']   \n",
       "\n",
       "                                     skillsList  resumeCount  \n",
       "0                     ['数据安全', '数据可视化', '数据挖掘']            0  \n",
       "1                     ['数据安全', '数据可视化', '数据挖掘']            0  \n",
       "2                               ['网络安全', '云计算']            0  \n",
       "3                                            []            2  \n",
       "4                     ['数据可视化', '机器学习', '数据挖掘']            0  \n",
       "...                                         ...          ...  \n",
       "1570   ['数据探索', '数据通信', '数据分析', '数据查询', '数据建模']            1  \n",
       "1571   ['数据查询', '数据分析', '数据建模', '数据探索', '接口开发']            0  \n",
       "1572   ['数据分析', '数据采集', '数据探索', '系统运维', '数据挖掘']            0  \n",
       "1573   ['数据探索', '数据采集', '数据查询', '数据分析', '数据计算']            0  \n",
       "1574  ['数据探索', '数据查询', '数据建模', '数据计算', '数据预处理']            3  \n",
       "\n",
       "[1575 rows x 10 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remove the constant status flag, the publisher fields and the all-NaN template id.\n",
    "JobDetail = JobDetail.drop(columns=[\"publisherName\", \"status\", \"publisher\", \"messageTemplateId\"], errors=\"ignore\")\n",
    "JobDetail"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.function处理，去掉[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    \"\"\"Flatten a bracketed, double-quoted list string into 'a|b' form.\n",
    "\n",
    "    Non-string values (e.g. NaN) are returned unchanged. A one-element\n",
    "    list yields just that element; several elements are joined with '|',\n",
    "    the separator used for the other list-like columns in this notebook.\n",
    "    \"\"\"\n",
    "    if not isinstance(x, str):\n",
    "        return x\n",
    "    # Drop the outer brackets, then unquote each comma-separated entry.\n",
    "    entries = [part.strip().strip('\"') for part in x.strip().strip(\"[]\").split(\",\")]\n",
    "    return \"|\".join(entry for entry in entries if entry)\n",
    "# Run the helper f over every value of the function column.\n",
    "JobDetail['function'] = JobDetail['function'].apply(f)\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.welfare,keywordList,skillsList数据处理\n",
    "\n",
    "去掉括号，及一些杂七杂八的字符"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    \"\"\"Normalise a serialized list string into '|'-separated plain text.\n",
    "\n",
    "    Quotes and square brackets are removed, commas become '|', and the\n",
    "    whitespace around each item is trimmed. Non-string values such as\n",
    "    None or NaN are returned unchanged.\n",
    "    \"\"\"\n",
    "    if not isinstance(x, str):\n",
    "        return x\n",
    "    # Remove both quote styles and the brackets in a single pass.\n",
    "    cleaned = x.translate(str.maketrans(\"\", \"\", \"\\\"'[]\"))\n",
    "    # Lists written as ['a', 'b'] carry a space after each comma; strip it.\n",
    "    return \"|\".join(item.strip() for item in cleaned.split(\",\"))\n",
    "\n",
    "# Run the cleanup helper f over each list-like text column.\n",
    "for list_column in ('welfare', 'keywordList', 'skillsList'):\n",
    "    JobDetail[list_column] = JobDetail[list_column].apply(f)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CompanyDetail处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>enterpriseId</th>\n",
       "      <th>logo</th>\n",
       "      <th>shortName</th>\n",
       "      <th>industry</th>\n",
       "      <th>econKind</th>\n",
       "      <th>startDate</th>\n",
       "      <th>registCapi</th>\n",
       "      <th>personScope</th>\n",
       "      <th>website</th>\n",
       "      <th>...</th>\n",
       "      <th>photo</th>\n",
       "      <th>label</th>\n",
       "      <th>postCode</th>\n",
       "      <th>recruitJobNum</th>\n",
       "      <th>totalPublicJobNum</th>\n",
       "      <th>provinceCode</th>\n",
       "      <th>cityCode</th>\n",
       "      <th>regionCode</th>\n",
       "      <th>detailedAddress</th>\n",
       "      <th>remarks</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1561969174736142336</td>\n",
       "      <td>1561969173914058752</td>\n",
       "      <td>/91440300MA5GKTA37A/1661238138156/166123813815...</td>\n",
       "      <td>漪畔网络</td>\n",
       "      <td>[\"互联网\",\"大数据\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100-200万</td>\n",
       "      <td>少于50人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>广东省广州市</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1507193335067049984</td>\n",
       "      <td>1507192012301008896</td>\n",
       "      <td>/admin/fd700d14-9b0c-18bc-ae22-3ade75207d54/fd...</td>\n",
       "      <td>海柔创新科技</td>\n",
       "      <td>[\"电子商务\",\"互联网\"]</td>\n",
       "      <td>合资</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>440000.0</td>\n",
       "      <td>440300.0</td>\n",
       "      <td>440306.0</td>\n",
       "      <td>西乡街道南昌社区安络科技产业园B栋201、301、401</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1648526959162036224</td>\n",
       "      <td>1648526958960709632</td>\n",
       "      <td>/admin/1e4afb8d-bea7-aeed-a65e-687d6da3a532/1e...</td>\n",
       "      <td>众云网</td>\n",
       "      <td>[\"互联网\",\"计算机软件\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>http://www.ouryun.com.cn/</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>440000.0</td>\n",
       "      <td>440300.0</td>\n",
       "      <td>440305.0</td>\n",
       "      <td>深圳市南山区粤海街道滨海社区海天一路19、17、18号软件产业基地4栋511</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1562373713612111872</td>\n",
       "      <td>1562373712110551040</td>\n",
       "      <td>/914403006188644414/1661334068655/166133406865...</td>\n",
       "      <td>中银信息</td>\n",
       "      <td>[\"互联网\",\"大数据\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>广东省广州市</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1648164336000176128</td>\n",
       "      <td>1648164335723352064</td>\n",
       "      <td>/admin/8fa85631-bfc8-7a0c-ad73-1cf6355aac59/8f...</td>\n",
       "      <td>森羽网络</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>50-100人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>370000.0</td>\n",
       "      <td>370800.0</td>\n",
       "      <td>370811.0</td>\n",
       "      <td>山东省济宁市高新区洸河街道百丰大厦803</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>558</th>\n",
       "      <td>1505828901006475264</td>\n",
       "      <td>1505827247335014400</td>\n",
       "      <td>/佛山市小羽科技有限公司/4154-16478520jpg/16478520.jpg</td>\n",
       "      <td>佛山小羽智能科技</td>\n",
       "      <td>[\"电子商务\",\"互联网\"]</td>\n",
       "      <td>合资</td>\n",
       "      <td>NaN</td>\n",
       "      <td>200-500万</td>\n",
       "      <td>50-100人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>440000.0</td>\n",
       "      <td>440600.0</td>\n",
       "      <td>440605.0</td>\n",
       "      <td>狮山镇软件园桃园路南海产业智库城一期B座B611室之二(住所申报)</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>559</th>\n",
       "      <td>1476026480080322653</td>\n",
       "      <td>1476026460258041857</td>\n",
       "      <td>NaN</td>\n",
       "      <td>朗新科技</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>合资</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>50-100人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>560</th>\n",
       "      <td>1491292954646675456</td>\n",
       "      <td>1491291815121387520</td>\n",
       "      <td>/泰盈科技/10142-logojpg/logo.jpg</td>\n",
       "      <td>泰盈科技</td>\n",
       "      <td>[\"人力资源服务\",\"企业服务\"]</td>\n",
       "      <td>合资</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>440000.0</td>\n",
       "      <td>440600.0</td>\n",
       "      <td>440605.0</td>\n",
       "      <td>桂城街道融和路25号荣耀国际金融中心4层401-408室(住所申报)</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>561</th>\n",
       "      <td>1518408400017424384</td>\n",
       "      <td>1518408399195340800</td>\n",
       "      <td>/91310000734084709Q/1650852186157/165085218615...</td>\n",
       "      <td>鼎捷软件</td>\n",
       "      <td>[\"互联网\",\"金融\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>500-1000人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>广东省广州市</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>562</th>\n",
       "      <td>1493064051390021632</td>\n",
       "      <td>1493063267566878720</td>\n",
       "      <td>/京东信息/43591-164480911png/164480911.png</td>\n",
       "      <td>京东信息</td>\n",
       "      <td>[\"电子商务\",\"互联网\"]</td>\n",
       "      <td>上市公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>10000人以上</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>320000.0</td>\n",
       "      <td>321300.0</td>\n",
       "      <td>321311.0</td>\n",
       "      <td>洪泽湖东路与清水江路交叉口</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>563 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                      id         enterpriseId  \\\n",
       "0    1561969174736142336  1561969173914058752   \n",
       "1    1507193335067049984  1507192012301008896   \n",
       "2    1648526959162036224  1648526958960709632   \n",
       "3    1562373713612111872  1562373712110551040   \n",
       "4    1648164336000176128  1648164335723352064   \n",
       "..                   ...                  ...   \n",
       "558  1505828901006475264  1505827247335014400   \n",
       "559  1476026480080322653  1476026460258041857   \n",
       "560  1491292954646675456  1491291815121387520   \n",
       "561  1518408400017424384  1518408399195340800   \n",
       "562  1493064051390021632  1493063267566878720   \n",
       "\n",
       "                                                  logo shortName  \\\n",
       "0    /91440300MA5GKTA37A/1661238138156/166123813815...      漪畔网络   \n",
       "1    /admin/fd700d14-9b0c-18bc-ae22-3ade75207d54/fd...    海柔创新科技   \n",
       "2    /admin/1e4afb8d-bea7-aeed-a65e-687d6da3a532/1e...       众云网   \n",
       "3    /914403006188644414/1661334068655/166133406865...      中银信息   \n",
       "4    /admin/8fa85631-bfc8-7a0c-ad73-1cf6355aac59/8f...      森羽网络   \n",
       "..                                                 ...       ...   \n",
       "558         /佛山市小羽科技有限公司/4154-16478520jpg/16478520.jpg  佛山小羽智能科技   \n",
       "559                                                NaN      朗新科技   \n",
       "560                       /泰盈科技/10142-logojpg/logo.jpg      泰盈科技   \n",
       "561  /91310000734084709Q/1650852186157/165085218615...      鼎捷软件   \n",
       "562             /京东信息/43591-164480911png/164480911.png      京东信息   \n",
       "\n",
       "              industry econKind  startDate registCapi personScope  \\\n",
       "0        [\"互联网\",\"大数据\"]     民营公司        NaN   100-200万       少于50人   \n",
       "1       [\"电子商务\",\"互联网\"]       合资        NaN    1000万以上    150-500人   \n",
       "2      [\"互联网\",\"计算机软件\"]     民营公司        NaN        NaN    150-500人   \n",
       "3        [\"互联网\",\"大数据\"]     民营公司        NaN    1000万以上    150-500人   \n",
       "4              [\"互联网\"]     民营公司        NaN        NaN     50-100人   \n",
       "..                 ...      ...        ...        ...         ...   \n",
       "558     [\"电子商务\",\"互联网\"]       合资        NaN   200-500万     50-100人   \n",
       "559            [\"互联网\"]       合资        NaN        NaN     50-100人   \n",
       "560  [\"人力资源服务\",\"企业服务\"]       合资        NaN    1000万以上    150-500人   \n",
       "561       [\"互联网\",\"金融\"]     民营公司        NaN    1000万以上   500-1000人   \n",
       "562     [\"电子商务\",\"互联网\"]     上市公司        NaN    1000万以上    10000人以上   \n",
       "\n",
       "                       website  ... photo label  postCode recruitJobNum  \\\n",
       "0                          NaN  ...   NaN   NaN       NaN             3   \n",
       "1                          NaN  ...   NaN   NaN       NaN             3   \n",
       "2    http://www.ouryun.com.cn/  ...   NaN    []       NaN             3   \n",
       "3                          NaN  ...   NaN   NaN       NaN             2   \n",
       "4                          NaN  ...   NaN    []       NaN             4   \n",
       "..                         ...  ...   ...   ...       ...           ...   \n",
       "558                        NaN  ...   NaN   NaN       NaN             5   \n",
       "559                        NaN  ...   NaN   NaN       NaN             2   \n",
       "560                        NaN  ...   NaN   NaN       NaN             2   \n",
       "561                        NaN  ...   NaN   NaN       NaN             1   \n",
       "562                        NaN  ...   NaN   NaN       NaN             1   \n",
       "\n",
       "    totalPublicJobNum provinceCode  cityCode  regionCode  \\\n",
       "0                   3          NaN       NaN         NaN   \n",
       "1                   3     440000.0  440300.0    440306.0   \n",
       "2                   3     440000.0  440300.0    440305.0   \n",
       "3                   2          NaN       NaN         NaN   \n",
       "4                   4     370000.0  370800.0    370811.0   \n",
       "..                ...          ...       ...         ...   \n",
       "558                 5     440000.0  440600.0    440605.0   \n",
       "559                 2          NaN       NaN         NaN   \n",
       "560                 2     440000.0  440600.0    440605.0   \n",
       "561                 1          NaN       NaN         NaN   \n",
       "562                 1     320000.0  321300.0    321311.0   \n",
       "\n",
       "                            detailedAddress  remarks  \n",
       "0                                    广东省广州市      NaN  \n",
       "1              西乡街道南昌社区安络科技产业园B栋201、301、401        0  \n",
       "2    深圳市南山区粤海街道滨海社区海天一路19、17、18号软件产业基地4栋511      NaN  \n",
       "3                                    广东省广州市      NaN  \n",
       "4                      山东省济宁市高新区洸河街道百丰大厦803      NaN  \n",
       "..                                      ...      ...  \n",
       "558       狮山镇软件园桃园路南海产业智库城一期B座B611室之二(住所申报)        0  \n",
       "559                                     NaN      NaN  \n",
       "560      桂城街道融和路25号荣耀国际金融中心4层401-408室(住所申报)        0  \n",
       "561                                  广东省广州市      NaN  \n",
       "562                           洪泽湖东路与清水江路交叉口        0  \n",
       "\n",
       "[563 rows x 24 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "CompanyDetail"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.标签删除"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. 删除id————只需要企业id\n",
    "2. startDate全是NaN，删除，代表的创建日期？\n",
    "3. website全是NaN，删除\n",
    "4. photo全是空，删除\n",
    "5. label全是空，删除\n",
    "6. postCode全是空，删除\n",
    "7. slogan全是空，删除\n",
    "8. website全是空，删除\n",
    "9. logo不需要\n",
    "10. email、phone不需要，进行删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "http://www.ouryun.com.cn/    1\n",
       "Name: website, dtype: int64"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "CompanyDetail['startDate'].value_counts()\n",
    "CompanyDetail['website'].value_counts()\n",
    "CompanyDetail['photo'].value_counts()\n",
    "CompanyDetail['label'].value_counts()\n",
    "CompanyDetail['postCode'].value_counts()\n",
    "CompanyDetail['slogan'].value_counts()\n",
    "CompanyDetail['website'].value_counts() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>enterpriseId</th>\n",
       "      <th>shortName</th>\n",
       "      <th>industry</th>\n",
       "      <th>econKind</th>\n",
       "      <th>registCapi</th>\n",
       "      <th>personScope</th>\n",
       "      <th>introduction</th>\n",
       "      <th>recruitJobNum</th>\n",
       "      <th>totalPublicJobNum</th>\n",
       "      <th>provinceCode</th>\n",
       "      <th>cityCode</th>\n",
       "      <th>regionCode</th>\n",
       "      <th>detailedAddress</th>\n",
       "      <th>remarks</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1561969173914058752</td>\n",
       "      <td>漪畔网络</td>\n",
       "      <td>[\"互联网\",\"大数据\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>100-200万</td>\n",
       "      <td>少于50人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>广东省广州市</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1507192012301008896</td>\n",
       "      <td>海柔创新科技</td>\n",
       "      <td>[\"电子商务\",\"互联网\"]</td>\n",
       "      <td>合资</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>深圳市海柔创新科技有限公司(HAI ROBOTICS CO., LTD.)，始于2016年深...</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>440000.0</td>\n",
       "      <td>440300.0</td>\n",
       "      <td>440306.0</td>\n",
       "      <td>西乡街道南昌社区安络科技产业园B栋201、301、401</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1648526958960709632</td>\n",
       "      <td>众云网</td>\n",
       "      <td>[\"互联网\",\"计算机软件\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>深圳市众云网有限公司（简称：众云网）成立于2014年，注册资金3500万元，(2021年)人...</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>440000.0</td>\n",
       "      <td>440300.0</td>\n",
       "      <td>440305.0</td>\n",
       "      <td>深圳市南山区粤海街道滨海社区海天一路19、17、18号软件产业基地4栋511</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1562373712110551040</td>\n",
       "      <td>中银信息</td>\n",
       "      <td>[\"互联网\",\"大数据\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>广东省广州市</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1648164335723352064</td>\n",
       "      <td>森羽网络</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>50-100人</td>\n",
       "      <td>济宁森羽网络科技有限公司成立于2018-01-18，法定代表人为杜佳昕，注册资本为500万元...</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>370000.0</td>\n",
       "      <td>370800.0</td>\n",
       "      <td>370811.0</td>\n",
       "      <td>山东省济宁市高新区洸河街道百丰大厦803</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>558</th>\n",
       "      <td>1505827247335014400</td>\n",
       "      <td>佛山小羽智能科技</td>\n",
       "      <td>[\"电子商务\",\"互联网\"]</td>\n",
       "      <td>合资</td>\n",
       "      <td>200-500万</td>\n",
       "      <td>50-100人</td>\n",
       "      <td>佛山市小羽智能科技有限公司成立于2021-01-18，法定代表人为徐化夷，注册资本为444....</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>440000.0</td>\n",
       "      <td>440600.0</td>\n",
       "      <td>440605.0</td>\n",
       "      <td>狮山镇软件园桃园路南海产业智库城一期B座B611室之二(住所申报)</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>559</th>\n",
       "      <td>1476026460258041857</td>\n",
       "      <td>朗新科技</td>\n",
       "      <td>[\"互联网\"]</td>\n",
       "      <td>合资</td>\n",
       "      <td>NaN</td>\n",
       "      <td>50-100人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>560</th>\n",
       "      <td>1491291815121387520</td>\n",
       "      <td>泰盈科技</td>\n",
       "      <td>[\"人力资源服务\",\"企业服务\"]</td>\n",
       "      <td>合资</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>广东泰盈科技有限公司成立于2018-09-06，法定代表人为王志利，注册资本为1000万元...</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>440000.0</td>\n",
       "      <td>440600.0</td>\n",
       "      <td>440605.0</td>\n",
       "      <td>桂城街道融和路25号荣耀国际金融中心4层401-408室(住所申报)</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>561</th>\n",
       "      <td>1518408399195340800</td>\n",
       "      <td>鼎捷软件</td>\n",
       "      <td>[\"互联网\",\"金融\"]</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>500-1000人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>广东省广州市</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>562</th>\n",
       "      <td>1493063267566878720</td>\n",
       "      <td>京东信息</td>\n",
       "      <td>[\"电子商务\",\"互联网\"]</td>\n",
       "      <td>上市公司</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>10000人以上</td>\n",
       "      <td>江苏京东信息技术有限公司成立于2009-06-16，法定代表人为曹珂，注册资本为2000万...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>320000.0</td>\n",
       "      <td>321300.0</td>\n",
       "      <td>321311.0</td>\n",
       "      <td>洪泽湖东路与清水江路交叉口</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>563 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            enterpriseId shortName           industry econKind registCapi  \\\n",
       "0    1561969173914058752      漪畔网络      [\"互联网\",\"大数据\"]     民营公司   100-200万   \n",
       "1    1507192012301008896    海柔创新科技     [\"电子商务\",\"互联网\"]       合资    1000万以上   \n",
       "2    1648526958960709632       众云网    [\"互联网\",\"计算机软件\"]     民营公司        NaN   \n",
       "3    1562373712110551040      中银信息      [\"互联网\",\"大数据\"]     民营公司    1000万以上   \n",
       "4    1648164335723352064      森羽网络            [\"互联网\"]     民营公司        NaN   \n",
       "..                   ...       ...                ...      ...        ...   \n",
       "558  1505827247335014400  佛山小羽智能科技     [\"电子商务\",\"互联网\"]       合资   200-500万   \n",
       "559  1476026460258041857      朗新科技            [\"互联网\"]       合资        NaN   \n",
       "560  1491291815121387520      泰盈科技  [\"人力资源服务\",\"企业服务\"]       合资    1000万以上   \n",
       "561  1518408399195340800      鼎捷软件       [\"互联网\",\"金融\"]     民营公司    1000万以上   \n",
       "562  1493063267566878720      京东信息     [\"电子商务\",\"互联网\"]     上市公司    1000万以上   \n",
       "\n",
       "    personScope                                       introduction  \\\n",
       "0         少于50人                                                NaN   \n",
       "1      150-500人  深圳市海柔创新科技有限公司(HAI ROBOTICS CO., LTD.)，始于2016年深...   \n",
       "2      150-500人  深圳市众云网有限公司（简称：众云网）成立于2014年，注册资金3500万元，(2021年)人...   \n",
       "3      150-500人                                                NaN   \n",
       "4       50-100人  济宁森羽网络科技有限公司成立于2018-01-18，法定代表人为杜佳昕，注册资本为500万元...   \n",
       "..          ...                                                ...   \n",
       "558     50-100人  佛山市小羽智能科技有限公司成立于2021-01-18，法定代表人为徐化夷，注册资本为444....   \n",
       "559     50-100人                                                NaN   \n",
       "560    150-500人   广东泰盈科技有限公司成立于2018-09-06，法定代表人为王志利，注册资本为1000万元...   \n",
       "561   500-1000人                                                NaN   \n",
       "562    10000人以上   江苏京东信息技术有限公司成立于2009-06-16，法定代表人为曹珂，注册资本为2000万...   \n",
       "\n",
       "     recruitJobNum  totalPublicJobNum  provinceCode  cityCode  regionCode  \\\n",
       "0                3                  3           NaN       NaN         NaN   \n",
       "1                3                  3      440000.0  440300.0    440306.0   \n",
       "2                3                  3      440000.0  440300.0    440305.0   \n",
       "3                2                  2           NaN       NaN         NaN   \n",
       "4                4                  4      370000.0  370800.0    370811.0   \n",
       "..             ...                ...           ...       ...         ...   \n",
       "558              5                  5      440000.0  440600.0    440605.0   \n",
       "559              2                  2           NaN       NaN         NaN   \n",
       "560              2                  2      440000.0  440600.0    440605.0   \n",
       "561              1                  1           NaN       NaN         NaN   \n",
       "562              1                  1      320000.0  321300.0    321311.0   \n",
       "\n",
       "                            detailedAddress remarks  \n",
       "0                                    广东省广州市     NaN  \n",
       "1              西乡街道南昌社区安络科技产业园B栋201、301、401       0  \n",
       "2    深圳市南山区粤海街道滨海社区海天一路19、17、18号软件产业基地4栋511     NaN  \n",
       "3                                    广东省广州市     NaN  \n",
       "4                      山东省济宁市高新区洸河街道百丰大厦803     NaN  \n",
       "..                                      ...     ...  \n",
       "558       狮山镇软件园桃园路南海产业智库城一期B座B611室之二(住所申报)       0  \n",
       "559                                     NaN     NaN  \n",
       "560      桂城街道融和路25号荣耀国际金融中心4层401-408室(住所申报)       0  \n",
       "561                                  广东省广州市     NaN  \n",
       "562                           洪泽湖东路与清水江路交叉口       0  \n",
       "\n",
       "[563 rows x 14 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "del_columns = [\"id\",\"logo\",\"startDate\",'website','photo','label','postCode','slogan','website',\"email\",\"phone\"]\n",
    "CompanyDetail = CompanyDetail[[i for i in CompanyDetail.columns if i not in del_columns]]\n",
    "CompanyDetail"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.省份、城市、区映射"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import requests\n",
    "# # 代码爬取\n",
    "# r = requests.get(\"https://www.5iai.com/api/dict/data/public/list/tree?dictType=region_code\",proxies={\"https\":\"http:localhost:7890\"}).json()\n",
    "# ProvinceDic = {}\n",
    "# CityDic = {}\n",
    "# RegionDic = {}\n",
    "\n",
    "# for province in r['data']:\n",
    "#     ProvinceDic[province['label']] = province['value']\n",
    "#     for city in province['children']:\n",
    "#         CityDic[city['label']] = city['value']\n",
    "#         for religon in city['children']:\n",
    "#             RegionDic[religon['label']] = religon['value']   \n",
    "\n",
    "# # 持久化存储\n",
    "# import pickle\n",
    "# with open(\"./data/ProvinceDic.pickle\", \"wb\") as f:\n",
    "#     pickle.dump(ProvinceDic, f)\n",
    "# with open(\"./data/CityDic.pickle\", \"wb\") as f:\n",
    "#     pickle.dump(CityDic, f)\n",
    "# with open(\"./data/RegionDic.pickle\", \"wb\") as f:\n",
    "#     pickle.dump(RegionDic, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "with open(\"../Data/OriginData/ProvinceDic.pickle\",\"rb\") as f:\n",
    "    ProvinceDic = pickle.load(f)\n",
    "with open(\"../Data/OriginData/CityDic.pickle\",\"rb\") as f:\n",
    "    CityDic = pickle.load(f)\n",
    "with open(\"../Data/OriginData/RegionDic.pickle\",\"rb\") as f:\n",
    "    RegionDic = pickle.load(f)\n",
    "CompanyDetail['provinceCode'] = CompanyDetail['provinceCode'].map(utils.ReverseDic(ProvinceDic,is_float=True))\n",
    "CompanyDetail['cityCode'] = CompanyDetail['cityCode'].map(utils.ReverseDic(CityDic,is_float=True))\n",
    "CompanyDetail['regionCode'] = CompanyDetail['regionCode'].map(utils.ReverseDic(RegionDic,is_float=True))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.industry 去除一些非法字符"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    x = x.replace(\"[\",\"\")\n",
    "    x = x.replace(\"]\",\"\")\n",
    "    x = x.replace(\"\\\"\",\"\")\n",
    "    x = x.split(\",\")\n",
    "    ans = \"\"\n",
    "    if len(x)>1:\n",
    "        for i in range(len(x)):\n",
    "            if i!= len(x)-1:\n",
    "                ans+=x[i]+\"|\"\n",
    "            else:\n",
    "                ans += x[i]\n",
    "    else:\n",
    "        ans += x[0]\n",
    "    return ans\n",
    "CompanyDetail['industry'] = CompanyDetail['industry'].map(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>enterpriseId</th>\n",
       "      <th>shortName</th>\n",
       "      <th>industry</th>\n",
       "      <th>econKind</th>\n",
       "      <th>registCapi</th>\n",
       "      <th>personScope</th>\n",
       "      <th>introduction</th>\n",
       "      <th>recruitJobNum</th>\n",
       "      <th>totalPublicJobNum</th>\n",
       "      <th>provinceCode</th>\n",
       "      <th>cityCode</th>\n",
       "      <th>regionCode</th>\n",
       "      <th>detailedAddress</th>\n",
       "      <th>remarks</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1561969173914058752</td>\n",
       "      <td>漪畔网络</td>\n",
       "      <td>互联网|大数据</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>100-200万</td>\n",
       "      <td>少于50人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>广东省广州市</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1507192012301008896</td>\n",
       "      <td>海柔创新科技</td>\n",
       "      <td>电子商务|互联网</td>\n",
       "      <td>合资</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>深圳市海柔创新科技有限公司(HAI ROBOTICS CO., LTD.)，始于2016年深...</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>广东省</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>宝安区</td>\n",
       "      <td>西乡街道南昌社区安络科技产业园B栋201、301、401</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1648526958960709632</td>\n",
       "      <td>众云网</td>\n",
       "      <td>互联网|计算机软件</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>深圳市众云网有限公司（简称：众云网）成立于2014年，注册资金3500万元，(2021年)人...</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>广东省</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>南山区</td>\n",
       "      <td>深圳市南山区粤海街道滨海社区海天一路19、17、18号软件产业基地4栋511</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1562373712110551040</td>\n",
       "      <td>中银信息</td>\n",
       "      <td>互联网|大数据</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>广东省广州市</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1648164335723352064</td>\n",
       "      <td>森羽网络</td>\n",
       "      <td>互联网</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>NaN</td>\n",
       "      <td>50-100人</td>\n",
       "      <td>济宁森羽网络科技有限公司成立于2018-01-18，法定代表人为杜佳昕，注册资本为500万元...</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>山东省</td>\n",
       "      <td>济宁市</td>\n",
       "      <td>任城区</td>\n",
       "      <td>山东省济宁市高新区洸河街道百丰大厦803</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>558</th>\n",
       "      <td>1505827247335014400</td>\n",
       "      <td>佛山小羽智能科技</td>\n",
       "      <td>电子商务|互联网</td>\n",
       "      <td>合资</td>\n",
       "      <td>200-500万</td>\n",
       "      <td>50-100人</td>\n",
       "      <td>佛山市小羽智能科技有限公司成立于2021-01-18，法定代表人为徐化夷，注册资本为444....</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>广东省</td>\n",
       "      <td>佛山市</td>\n",
       "      <td>南海区</td>\n",
       "      <td>狮山镇软件园桃园路南海产业智库城一期B座B611室之二(住所申报)</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>559</th>\n",
       "      <td>1476026460258041857</td>\n",
       "      <td>朗新科技</td>\n",
       "      <td>互联网</td>\n",
       "      <td>合资</td>\n",
       "      <td>NaN</td>\n",
       "      <td>50-100人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>560</th>\n",
       "      <td>1491291815121387520</td>\n",
       "      <td>泰盈科技</td>\n",
       "      <td>人力资源服务|企业服务</td>\n",
       "      <td>合资</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>150-500人</td>\n",
       "      <td>广东泰盈科技有限公司成立于2018-09-06，法定代表人为王志利，注册资本为1000万元...</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>广东省</td>\n",
       "      <td>佛山市</td>\n",
       "      <td>南海区</td>\n",
       "      <td>桂城街道融和路25号荣耀国际金融中心4层401-408室(住所申报)</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>561</th>\n",
       "      <td>1518408399195340800</td>\n",
       "      <td>鼎捷软件</td>\n",
       "      <td>互联网|金融</td>\n",
       "      <td>民营公司</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>500-1000人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>广东省广州市</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>562</th>\n",
       "      <td>1493063267566878720</td>\n",
       "      <td>京东信息</td>\n",
       "      <td>电子商务|互联网</td>\n",
       "      <td>上市公司</td>\n",
       "      <td>1000万以上</td>\n",
       "      <td>10000人以上</td>\n",
       "      <td>江苏京东信息技术有限公司成立于2009-06-16，法定代表人为曹珂，注册资本为2000万...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>江苏省</td>\n",
       "      <td>宿迁市</td>\n",
       "      <td>宿豫区</td>\n",
       "      <td>洪泽湖东路与清水江路交叉口</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>563 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            enterpriseId shortName     industry econKind registCapi  \\\n",
       "0    1561969173914058752      漪畔网络      互联网|大数据     民营公司   100-200万   \n",
       "1    1507192012301008896    海柔创新科技     电子商务|互联网       合资    1000万以上   \n",
       "2    1648526958960709632       众云网    互联网|计算机软件     民营公司        NaN   \n",
       "3    1562373712110551040      中银信息      互联网|大数据     民营公司    1000万以上   \n",
       "4    1648164335723352064      森羽网络          互联网     民营公司        NaN   \n",
       "..                   ...       ...          ...      ...        ...   \n",
       "558  1505827247335014400  佛山小羽智能科技     电子商务|互联网       合资   200-500万   \n",
       "559  1476026460258041857      朗新科技          互联网       合资        NaN   \n",
       "560  1491291815121387520      泰盈科技  人力资源服务|企业服务       合资    1000万以上   \n",
       "561  1518408399195340800      鼎捷软件       互联网|金融     民营公司    1000万以上   \n",
       "562  1493063267566878720      京东信息     电子商务|互联网     上市公司    1000万以上   \n",
       "\n",
       "    personScope                                       introduction  \\\n",
       "0         少于50人                                                NaN   \n",
       "1      150-500人  深圳市海柔创新科技有限公司(HAI ROBOTICS CO., LTD.)，始于2016年深...   \n",
       "2      150-500人  深圳市众云网有限公司（简称：众云网）成立于2014年，注册资金3500万元，(2021年)人...   \n",
       "3      150-500人                                                NaN   \n",
       "4       50-100人  济宁森羽网络科技有限公司成立于2018-01-18，法定代表人为杜佳昕，注册资本为500万元...   \n",
       "..          ...                                                ...   \n",
       "558     50-100人  佛山市小羽智能科技有限公司成立于2021-01-18，法定代表人为徐化夷，注册资本为444....   \n",
       "559     50-100人                                                NaN   \n",
       "560    150-500人   广东泰盈科技有限公司成立于2018-09-06，法定代表人为王志利，注册资本为1000万元...   \n",
       "561   500-1000人                                                NaN   \n",
       "562    10000人以上   江苏京东信息技术有限公司成立于2009-06-16，法定代表人为曹珂，注册资本为2000万...   \n",
       "\n",
       "     recruitJobNum  totalPublicJobNum provinceCode cityCode regionCode  \\\n",
       "0                3                  3          NaN      NaN        NaN   \n",
       "1                3                  3          广东省      深圳市        宝安区   \n",
       "2                3                  3          广东省      深圳市        南山区   \n",
       "3                2                  2          NaN      NaN        NaN   \n",
       "4                4                  4          山东省      济宁市        任城区   \n",
       "..             ...                ...          ...      ...        ...   \n",
       "558              5                  5          广东省      佛山市        南海区   \n",
       "559              2                  2          NaN      NaN        NaN   \n",
       "560              2                  2          广东省      佛山市        南海区   \n",
       "561              1                  1          NaN      NaN        NaN   \n",
       "562              1                  1          江苏省      宿迁市        宿豫区   \n",
       "\n",
       "                            detailedAddress remarks  \n",
       "0                                    广东省广州市     NaN  \n",
       "1              西乡街道南昌社区安络科技产业园B栋201、301、401       0  \n",
       "2    深圳市南山区粤海街道滨海社区海天一路19、17、18号软件产业基地4栋511     NaN  \n",
       "3                                    广东省广州市     NaN  \n",
       "4                      山东省济宁市高新区洸河街道百丰大厦803     NaN  \n",
       "..                                      ...     ...  \n",
       "558       狮山镇软件园桃园路南海产业智库城一期B座B611室之二(住所申报)       0  \n",
       "559                                     NaN     NaN  \n",
       "560      桂城街道融和路25号荣耀国际金融中心4层401-408室(住所申报)       0  \n",
       "561                                  广东省广州市     NaN  \n",
       "562                           洪泽湖东路与清水江路交叉口       0  \n",
       "\n",
       "[563 rows x 14 columns]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "CompanyDetail"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 联合处理"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.JobDetail缺失值工作填充，用企业的地址来填充工作的地址"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "33\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "set()"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp = JobDetail.merge(Job).merge(CompanyDetail)[[\"jobId\",\"workplace\",\"provinceCode\",\"cityCode\",\"regionCode\"]]        \n",
    "# merge之后Jobid不会在对应位置上对应\n",
    "print((temp.jobId == JobDetail.jobId).sum())\n",
    "\n",
    "set(temp.jobId).difference(JobDetail.jobId)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def fillWorkplace(row):\n",
    "    if not isinstance(row['workplace'],str):\n",
    "        row['workplace'] = row['provinceCode'] if isinstance(row['provinceCode'],str) else \"\"\n",
    "        row['workplace'] += row['cityCode'] if isinstance(row['cityCode'],str) else \"\"\n",
    "        row['workplace'] += row['regionCode'] if isinstance(row['regionCode'],str) else \"\"\n",
    "        return row\n",
    "    else:\n",
    "        row['workplace'] = row['workplace']\n",
    "        return row\n",
    "temp = temp.apply(lambda row:fillWorkplace(row),axis=1)                     # 修改temp\n",
    "JobDetail['workplace2'] = JobDetail['jobId']\n",
    "JobDetail['workplace2'] = JobDetail['workplace2'].map(dict(zip(temp.jobId,temp.workplace)))     # 进行字典映射，不会出现数据不一致情况\n",
    "del JobDetail['workplace']  \n",
    "JobDetail['workplace2'].value_counts().sum() == len(Job)            # 不存在缺失值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "JobDetail[\"workplace2\"] = JobDetail[\"workplace2\"].map(lambda x:\"广州\" if x== \"\" else x)         # 众数填充空字符"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f(x):\n",
    "    import re\n",
    "    pattern = r\"(.+市).+区\"\n",
    "    result = re.sub(pattern, r\"\\1\", x)\n",
    "    result = re.sub(r\"(.+广州市).*\",r\"\\1\", result)\n",
    "    result = re.sub(r\"(广州).*\",r\"\\1\", result)\n",
    "    if result==\"广州\":\n",
    "        return \"广东省广州市\"\n",
    "    else:\n",
    "        return result\n",
    "def f2(x):\n",
    "    if \"北京\" in x:\n",
    "        x =  \"北京省北京市\"\n",
    "    elif \"上海\" in x:\n",
    "        x = \"上海省上海市\"\n",
    "    elif \"天津\" in x:\n",
    "        x = \"天津省天津市\"\n",
    "    elif \"重庆\" in x:\n",
    "        x = \"重庆省重庆市\"\n",
    "    elif len(x) == 4:\n",
    "        x = x[:2] + \"省\" + x[2:] + \"市\"\n",
    "    \n",
    "    \n",
    "    if \"省\" in x:\n",
    "        x = x.split(\"省\")\n",
    "        x[0] += \"省\"\n",
    "        if \"市\" not in x[1]:x[1] += \"市\"\n",
    "        if \"东莞市东莞\" in x[1]:x[1] = \"东莞市\"\n",
    "        return \"|\".join(x)\n",
    "    elif \" \" in x:\n",
    "        x = x.split(\" \")\n",
    "        if \"省\" not in x[0]:x[0] += \"省\"\n",
    "        if \"市\" not in x[1]:x[1] += \"市\"\n",
    "        return \"|\".join(x)\n",
    "    return x\n",
    "    \n",
    "JobDetail['workplace2'] = JobDetail['workplace2'].map(f).map(f2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "JobDetail['workplace2']\n",
    "split_df = JobDetail['workplace2'].str.split(\"|\",expand=True)\n",
    "split_df = split_df.rename(columns={0:\"province\",1:\"city\"})\n",
    "JobDetail = pd.concat([JobDetail, split_df],axis=1)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 汇总"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "Job.to_csv(\"../Data/ProcessData/Job.csv\",index=None,encoding='utf-8')\n",
    "JobDetail.to_csv(\"../Data/ProcessData/JobDetail.csv\",index=None,encoding='utf-8')\n",
    "CompanyDetail.to_csv(\"../Data/ProcessData/CompanyDetail.csv\",index=None,encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
